#!/usr/bin/env python

# Jacob Joseph
# 10 December 2009

# Build a table of taxonomy identifiers, scientific names, and common
# names from the NCBI taxonomy database.

import os, tarfile, urllib, csv
from DurandDB import blastq

class tax:
    """A single taxonomy entry: an identifier plus its names.

    Attributes:
        tax_id: the taxonomy identifier.
        scientific_name: the scientific name, or None if not (yet) known.
        common_names: list of common names; may be empty.
    """

    def __init__(self, tax_id, scientific_name=None, common_names = None):
        # None stands in for "no names yet".  A new list is created per
        # instance so that entries never share one list object; a list
        # supplied by the caller is stored as-is (not copied).
        self.common_names = [] if common_names is None else common_names
        self.scientific_name = scientific_name
        self.tax_id = tax_id

    def tup(self):
        """Return the entry as (tax_id, scientific_name, common_names)."""
        record = (self.tax_id, self.scientific_name, self.common_names)
        return record

    def __repr__(self):
        # Render exactly like the tuple form of the entry.
        return "%s" % (self.tup(),)

class tax_insert:
    """Load NCBI taxonomy names into the ``taxon`` database table.

    The names file inside the NCBI taxonomy archive is parsed into ``tax``
    objects (one per tax_id), which are then inserted as rows through the
    ``blastq`` database handle.

    NOTE: this module uses the Python 2 ``urllib.urlopen`` and
    ``dict.itervalues`` APIs.
    """

    def __init__(self):
        # Database handle; rows are written through self.bq.dbw.
        self.bq = blastq.blastq()

        # URL of the taxonomy archive, and the archive member holding names.
        self.archive = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
        self.fname = "names.dmp"

    def parse_names(self):
        """Fetch the archive and parse its names file.

        Returns:
            dict mapping integer tax_id to a ``tax`` object.  Only rows of
            class "scientific name" or "common name" contribute.

        Raises:
            IOError: if ``self.fname`` cannot be read from the archive.
            AssertionError: if a tax_id has more than one scientific name.
            ValueError: if a non-blank row does not have exactly five
                '|'-separated fields, or its tax_id is not an integer.
        """
        urlfd = urllib.urlopen(self.archive)
        #urlfd = open(os.path.expandvars("$HOME/tmp/taxdump.tar.gz"),'r')
        try:
            # "r|gz" reads the archive as a forward-only stream, so the
            # member must be consumed while both handles are still open.
            tar = tarfile.open(fileobj=urlfd, mode="r|gz")
            try:
                # Walk the members ourselves instead of looking one up by
                # name; workaround for odd behavior in tarfile:
                # http://mail.python.org/pipermail/tutor/2007-March/053136.html
                fd = None
                for tarinfo in tar:
                    if tarinfo.name == self.fname:
                        fd = tar.extractfile(tarinfo)
                        break

                # fd stays None when the member is absent, and extractfile
                # itself returns None for members that are not regular files.
                if fd is None:
                    raise IOError("%s not found in %s" % (
                        self.fname, self.archive))

                return self._build_tax_dict(fd)
            finally:
                tar.close()
        finally:
            urlfd.close()

    def _build_tax_dict(self, fd):
        """Parse '|'-delimited name rows read from fd into {tax_id: tax}."""
        tax_dict = {}
        for row in csv.reader(fd, delimiter = "|"):
            if not row:
                # csv yields an empty list for a blank line; nothing to parse.
                continue

            row = [a.strip() for a in row]
            # Exactly five fields are expected; the second-to-last is the
            # name class, the others besides tax_id and name are unused.
            (tax_id, name, _unique_name, name_class, _trailing) = row
            tax_id = int(tax_id)

            if name_class not in ("common name", "scientific name"):
                continue

            entry = tax_dict.get(tax_id)
            if entry is None:
                entry = tax_dict[tax_id] = tax(tax_id)

            if name_class == "scientific name":
                # Raised explicitly (rather than via an assert statement) so
                # the check still runs under "python -O"; the exception type
                # is the same as before.
                if entry.scientific_name is not None:
                    raise AssertionError("%s\n%s\n%s" % (
                        "Duplicate scientific name!", row, entry))
                entry.scientific_name = name
            else:
                entry.common_names.append(name)

        return tax_dict


    def insert_table(self):
        """Parse the names file and insert one ``taxon`` row per tax_id.

        All rows are executed first and committed once at the end.
        """

        i = """INSERT INTO taxon
        (tax_id, name, common_names)
        VALUES
        (%s, %s, %s)"""

        tax_dict = self.parse_names()

        for t in tax_dict.itervalues():
            # Parameters: (tax_id, scientific_name, common_names).
            self.bq.dbw.execute(i, t.tup())

        self.bq.dbw.commit()

if __name__ == "__main__":
    # Run as a script: fetch the taxonomy names and fill the taxon table.
    loader = tax_insert()

    # To parse without inserting anything, call this instead:
    #tax_dict = loader.parse_names()
    loader.insert_table()
