#!/usr/bin/env python

# Jacob Joseph
# 10 December 2009

# Build a table of taxonomy identifiers, scientific names, and common
# names from the NCBI taxonomy database.

import os
import tarfile
import urllib
import csv

from DurandDB import blastq


class tax:
    """A single taxonomy record: identifier, scientific name, common names."""

    def __init__(self, tax_id, scientific_name=None, common_names = None):
        """Store the record fields.

        tax_id          -- taxonomy identifier
        scientific_name -- scientific name, or None if not yet known
        common_names    -- list of common names; a fresh empty list is
                           created when omitted, so instances never share one
        """
        self.tax_id = tax_id
        self.scientific_name = scientific_name

        if common_names is None:
            common_names = []
        self.common_names = common_names

    def tup(self):
        """Return (tax_id, scientific_name, common_names) as a tuple."""
        return (self.tax_id, self.scientific_name, self.common_names)

    def __repr__(self):
        return str(self.tup())


class tax_insert:
    """Download the NCBI taxonomy dump and load its names into the database."""

    def __init__(self):
        # Database handle; insert_table() uses its 'dbw' attribute
        # (presumably a writable connection -- confirm in DurandDB.blastq).
        self.bq = blastq.blastq()
        # Location of the gzipped tar archive, and the member to read from it.
        self.archive = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
        self.fname = "names.dmp"

    def parse_names(self):
        """Fetch the archive and parse the names file into tax objects.

        Returns a dict mapping integer tax_id to a tax instance.  Only rows
        whose name class is "scientific name" or "common name" contribute.

        Raises IOError if the archive holds no readable member named
        self.fname, and AssertionError if a tax_id carries more than one
        scientific name.
        """
        urlfd = urllib.urlopen(self.archive)
        # Local alternative to the download, kept for testing:
        #urlfd = open(os.path.expandvars("$HOME/tmp/taxdump.tar.gz"),'r')
        try:
            # "r|gz" reads the archive as a forward-only stream, so the
            # download never has to be seekable or saved to disk.
            tar = tarfile.open(fileobj=urlfd, mode="r|gz")
            try:
                # workaround odd behavior in tarfile:
                # http://mail.python.org/pipermail/tutor/2007-March/053136.html
                # Walk the members in order and stop at the wanted one
                # rather than asking for it by name.
                fd = None
                for tarinfo in tar:
                    if tarinfo.name == self.fname:
                        fd = tar.extractfile(tarinfo)
                        break

                if fd is None:
                    raise IOError("'%s' not found as a regular file in %s"
                                  % (self.fname, self.archive))

                # The member is read from the stream lazily, so it must be
                # consumed completely before the archive is closed below.
                return self._read_names(fd)
            finally:
                tar.close()
        finally:
            # tarfile does not close a file object it was handed.
            urlfd.close()

    def _read_names(self, fd):
        """Build the tax_id -> tax dict from an open names file object."""
        tax_dict = {}

        for row in csv.reader(fd, delimiter="|"):
            row = [a.strip() for a in row]

            # Each row must split into exactly five fields; the last one is
            # not used.
            (tax_id, name, unique_name, name_class, _) = row
            tax_id = int(tax_id)

            if name_class not in ("common name", "scientific name"):
                continue

            if tax_id not in tax_dict:
                tax_dict[tax_id] = tax(tax_id)

            if name_class == "scientific name":
                assert tax_dict[tax_id].scientific_name is None, "%s\n%s\n%s" % (
                    "Duplicate scientific name!", row, tax_dict[tax_id])
                tax_dict[tax_id].scientific_name = name
            else:
                # Only "common name" can reach here after the filter above.
                tax_dict[tax_id].common_names.append(name)

        return tax_dict

    def insert_table(self):
        """Insert every parsed taxonomy record into the taxon table.

        Rows are inserted one at a time and committed once at the end, so a
        failure part-way through commits nothing from this method.
        """
        i = ("INSERT INTO taxon (tax_id, name, common_names) "
             "VALUES (%s, %s, %s)")

        tax_dict = self.parse_names()

        # NOTE(review): common_names is passed as a Python list; this assumes
        # the database driver adapts it to the column type -- confirm.
        for t in tax_dict.itervalues():
            self.bq.dbw.execute(i, t.tup())

        self.bq.dbw.commit()


if __name__ == "__main__":
    t = tax_insert()
    #tax_dict = t.parse_names()
    t.insert_table()