#!/usr/bin/env python

# Jacob Joseph
# 29 Sept 2008

# Facilitate integration of arbitrary FASTA files to the DurandLab2
# database.

import time, getopt, sys
from Bio import SeqIO
from insert_generic import insert_generic
from JJutil import crc64

class insert_fasta(insert_generic):

    def __init__(self, source_name, source_version, fname,
                 source_ver_id=None, custom_name_parser=None,
                 gb_tax_id = None):
        insert_generic.__init__(self)

        self.source_name = source_name
        self.source_info[source_name] = {
            'version': source_version,
            'date': time.strftime('%Y-%m-%d %H:%M:%S') }

        # insert (or lookup) a prot seq source version
        if source_ver_id is not None:
            self.source_info[self.source_name]['source_ver_id']= source_ver_id
        else:
            self.lookup_prot_seq_source_ver(
                self.source_name,
                self.source_info[self.source_name]['version'],
                self.source_info[self.source_name]['date'])

        m = "*** Inserting sequences with source_ver_id='%d' ***"
        print m % self.source_info[self.source_name]['source_ver_id']

        if custom_name_parser == 'ppod_species':
            self.parse_name_descr = self.parse_name_descr_ppod
        elif custom_name_parser == 'ppod_species_uniprot':
            self.parse_name_descr = self.parse_name_descr_ppod_withuniprot
        elif custom_name_parser is not None:
            assert False, "Unknown custom_name_parser: %s" % custom_name_parser
        else:
            self.parse_name_descr = self.parse_name_descr_basic

        self.parse_fasta( fname, gb_tax_id)

    def parse_name_descr_ppod(self, record):
        # e.g., >Arabidopsis thaliana|TAIR:gene:1005027766
        l = record.description
        larr = l.split('|')
        name = larr[1]
        descr = larr[0]
        
        return (name, descr)

    def parse_name_descr_ppod_withuniprot(self, record):
        # e.g.,
        # >AQUAE|ENTREZ:1193999|NCBI:NP_214325
        # >BACTN|ENTREZ:1073534|NCBI:NP_809639
        # >BRAJA|ENTREZ:1053640|NCBI:NP_768268
        # >CAEBR|ENTREZ:5633788|NCBI:XP_001671128
        # >CHICK|ENTREZ:422364|NCBI:XP_420335
        # >ENTHI|ENTREZ:3404274|NCBI:XP_649976
        # >LEIMA|ENTREZ:5648894|NCBI:XP_001687509
        # >NEUCR|ENTREZ:3873340|NCBI:XP_957188
        # >PLAYO|ENTREZ:3806726|NCBI:XP_729427
        # >RAT|RGD:1311090|NCBI:XP_001057505
        # >STRPU|ENTREZ:755434|NCBI:XP_001189843
        # >TETTH|ENTREZ:4505939|NCBI:XP_001015290

        # >ARATH|TAIR:locus:2078426|NCBI:NP_191191.2
        # >BACSU|ENTREZ:939141|NCBI:NP_390014
        # >DEIRA|ENTREZ:1798069|NCBI:NP_296367
        # >GLOVI|ENTREZ:2600530|NCBI:NP_925075
        # >LEPIN|ENTREZ:1152095|NCBI:NP_712933
        # >METAC|ENTREZ:1473979|NCBI:NP_617010
        # >MOUSE|MGI:MGI:3649014|NCBI:NP_001030076
        # >ORYSJ|ENTREZ:3131451|NCBI:NP_039365
        # >STRCO|ENTREZ:1099793|NCBI:NP_628523
        # >SULSO|ENTREZ:1455383|NCBI:NP_341783
        # >THEMA|ENTREZ:897476|NCBI:NP_228761
        # >YEAST|SGD:S000028557|NCBI:NP_878108
        
        # >GEOSL|ENTREZ:2808007|NCBI:YP_016331

        # >DICDI|dictyBase:DDB_G0273975|NCBI:EAL70421.1
        
        # >BOVIN|ENSEMBL:ENSBTAG00000036184|ENSEMBL:ENSBTAP00000047346
        # >PANTR|ENSEMBL:ENSPTRG00000031326|ENSEMBL:ENSPTRP00000053827
        # >CIOIN|ENSEMBL:ENSCING00000016250|ENSEMBL:ENSCINP00000028259
        # >XENTR|ENSEMBL:ENSXETG00000027244|ENSEMBL:ENSXETP00000056950
        # >FUGRU|ENSEMBL:ENSTRUG00000004123|ENSEMBL:ENSTRUP00000009764
        # >CANFA|ENSEMBL:ENSCAFG00000009450|ENSEMBL:ENSCAFP00000013904
        # >MACMU|ENSEMBL:ENSMMUG00000029023|ENSEMBL:ENSMMUP00000032206
        # >ANOGA|ENSEMBL:AGAP000001|ENSEMBL:AGAP000001-PA
        # >MONDO|ENSEMBL:ENSMODG00000025437|ENSEMBL:ENSMODP00000008116
        # >ORNAN|ENSEMBL:ENSOANG00000008004|ENSEMBL:ENSOANP00000012730
        # >DANRE|ENSEMBL:ENSDARG00000041705|ENSEMBL:ENSDARP00000088909
        
        # >ASHGO|ENTREZ:4619397|UniProtKB:Q75CP6
        # >EMENI|ENTREZ:2876911|UniProtKB:P05147
        # >CHLTA|ENTREZ:3687317|UniProtKB:Q3KM97
        # >CHLAA|ENTREZ:5825221|UniProtKB:A9WKH6
        # >ECOLI|ECOLI:EG10986-MONOMER|UniProtKB:P05100
        # >DROME|FB:FBgn0010339|UniProtKB:P32234
        # >CHLRE|ENTREZ:5728351|UniProtKB:P52908
        # >HUMAN|ENSEMBL:ENSG00000166913|UniProtKB:P31946
        # >SCHPO|GeneDB_Spombe:SPAC922.03|UniProtKB:Q9URX3
        # >PSEA7|ENTREZ:5357561|UniProtKB:A6V1E9
        # >CAEEL|WB:WBGene00003920|UniProtKB:P41932

        l = record.description
        larr = l.split('|')

        if larr[2].startswith('UniProtKB:'):
            name = larr[2][10:]
        elif larr[1].startswith('MGI'):
            name = larr[1][4:]
        else:
            name = larr[1]

        descr = l
        
        return (name, descr)

    def parse_name_descr_basic(self, record):
        return (record.name, record.description)

    def parse_fasta(self, fname, gb_tax_id=None):
        source_ver_id = self.source_info[self.source_name]['source_ver_id']
        
        fd = open( fname, 'rU')
        for record in SeqIO.parse( fd, 'fasta'):
            # Some fasta files come in nonstandard name formats. Parse
            # them with custom functions
            (name, descr) = self.parse_name_descr( record)
            #print name, descr
            self.insert_sequence( source_ver_id, name, descr,
                                  record.seq.tostring(),
                                  gb_tax_id)

    def compare_prot_seq( self, seq_id, primary_acc, description,
                          sequence, seq_length, seq_crc, seq_mw,
                          gb_tax_id):
        
        if not seq_id == self.compare_fa_descr( seq_id, description):
            m = "Sequence '%s' already exists with seq_id '%d'." % (
                primary_acc, seq_id)
            m += "\nHowever, the FASTA description differs: '%s' != '%s'" % (
                description, self.fetch_fa_descr( seq_id))

            #assert False, m
            print "IGNORING ENTRY:", m
            return seq_id

        if not seq_id == self.compare_prot_seq_str(
            seq_id, sequence, seq_length, seq_crc, seq_mw):
            m = "Sequence '%s' already exists with seq_id '%d'." % (
                primary_acc, seq_id)
            m += "\nHowever, the sequence tuple differs: '%s' != '%s'" % (
                (sequence, seq_len, seq_crc, seq_mw),
                self.fetch_prot_seq_str( seq_id))
            assert False, m
            # return None

        if not seq_id == self.compare_gb_tax_id( seq_id, gb_tax_id):
            m = "Sequence '%s' already exists with seq_id '%d'." % (
                primary_acc, seq_id)
            m += "\nHowever, the taxonomy id differs: '%d' != '%d'" % (
                self.fetch_tax_id(seq_id), gb_tax_id)
            assert False, m

        return seq_id
    
    def insert_sequence(self, source_ver_id, primary_acc,
                        description, sequence, gb_tax_id):

        seq_length = len(sequence)
        seq_crc = crc64.CRC64digest(sequence)
        seq_mw = 0 # dummy molecular weight--a required field
        
        # insert only if this prot_seq_version entry does not already
        # exist.  Allows one to resume runs in the event of database
        # issues, program crashes
        q = """SELECT seq_id FROM prot_seq_version
        WHERE source_ver_id = %(source_ver_id)s
        AND primary_acc = %(primary_acc)s"""

        seq_id = self.dbw.fetchsingle(q, locals())

        if seq_id is not None and seq_id == self.compare_prot_seq(
            seq_id, primary_acc, description, sequence, seq_length,
            seq_crc, seq_mw, gb_tax_id):
            # already inserted
            pass

        else:
            # create a new seq_id. Dummy molecular weight
            seq_id = self.insert_prot_seq( sequence,
                                           seq_length,
                                           seq_crc,
                                           seq_mw)

            self.insert_fa_descr(seq_id, description)

            if gb_tax_id is not None:
                self.insert_gb_tax_id(seq_id, gb_tax_id)

            self.insert_prot_seq_version_simple( seq_id, source_ver_id,
                                                 primary_acc)

            self.dbw.commit()

        return


    def insert_fa_descr(self, seq_id, description):
        i = """INSERT INTO fa_descr (seq_id, descr)
        VALUES (%(seq_id)s, %(description)s)"""

        self.dbw.execute(i, locals())
        return
    
    
    def fetch_fa_descr(self, seq_id):
        q = """SELECT descr FROM fa_descr
        WHERE seq_id = %(seq_id)s"""

        fa_desc = self.dbw.fetchsingle(q, locals())

        return fa_desc

    def compare_fa_descr(self, seq_id, description):
        fa_desc = self.fetch_fa_descr( seq_id)

        if fa_desc == description:
            return seq_id
        else:
            return None

    def insert_gb_tax_id(self, seq_id, tax_id):
        i = """INSERT INTO prot_seq_taxonomy (seq_id, tax_id)
        VALUES (%(seq_id)s, %(tax_id)s)"""

        self.dbw.execute(i, locals())
        return
        
    def fetch_gb_tax_id(self, seq_id):
        q = """SELECT tax_id FROM prot_seq_taxonomy
        WHERE seq_id = %(seq_id)s"""
        return self.dbw.fetchsingle(q, locals())
        
    def compare_gb_tax_id(self, seq_id, tax_id):
        gb_tax_id = self.fetch_gb_tax_id( seq_id)

        if gb_tax_id == tax_id:
            return seq_id
        else:
            return None
        
def usage():
    s = """DurandLab2 FASTA file inserter

Usage:

    'insert_fasta.py -f <fasta_file> [options]'

Options:
    -f, --fasta <name>                 (Required)
        Path and filename of the FASTA file to insert.

    -i, --source_ver_id <int>          (Optional)

    -n, --source_name <string>         (Required if no '-i')

    -v, --source_version <string>      (Required if no '-i']

    -g, --gb_tax_id <int>              (Optional)
        Label all sequences as having a specific taxonomy identifier.
        This is helpful when the input is split into separate files by
        species.
 
    -c, --custom_name_parser <string>  (Optional)
        Select a custom parser for the FASTA name and description
        fields.  E.g., 'ppod_species' or 'ppod_species_uniprot'.
         
If an existing source_ver_id is not specified, 'source_name' and
'source_version' are required.

"""
    return s


def main():
    try:
        opts, args = getopt.getopt( sys.argv[1:], 'n:v:f:hi:c:g:',
                                    ['fasta=', 'source_ver_id=',
                                     'source_name=', 'source_version=',
                                     'help', 'custom_name_parser=',
                                     'gb_tax_id='])
    except getopt.GetoptError:
        print usage()
        raise

    help = False
    source_name = None
    source_version = None
    fasta_file = None
    source_ver_id = None
    custom_name_parser = None
    gb_tax_id = None
    for o, a in opts:
        if o in ('-h', '--help'):
            help = True
        if o in ('-n','--source_name'): source_name = a
        if o in ('-v','--source_version'): source_version = a
        if o in ('-f','--fasta'): fasta_file = a
        if o in ('-i','--source_ver_id'): source_ver_id = int(a)
        if o in ('-c','--custom_name_parser'): custom_name_parser = a
        if o in ('-g','--gb_tax_id'): gb_tax_id = int(a)

    if help or fasta_file is None:
        print usage()
        sys.exit()

    if source_ver_id is not None:
        pass
    elif (source_name is not None
          and source_version is not None):
        pass
    else:
        print "Error: Unable to execute with given parameters:"
        print "fasta_file:", fasta_file
        print "source_name:", source_name
        print "source_version:", source_version
        print "sourve_ver_id:", source_ver_id
        print "custom_name_parser:", custom_name_parser
        print "gb_tax_id:", gb_tax_id
        print usage()
        sys.exit()

    print "Inserting file:", fasta_file
    print "As source '%s', Version '%s'" % (source_name, source_version)

    
    in_fasta = insert_fasta( source_name, source_version,
                             fasta_file, source_ver_id=source_ver_id,
                             custom_name_parser = custom_name_parser,
                             gb_tax_id = gb_tax_id)
    return in_fasta
    
if __name__ == "__main__":
    in_fasta = main()
