#!/usr/bin/env python # Jacob Joseph # 19 June 2010 # Basis for exploring a clustering in varying ways, particularly as # HTML. For example, for hierarchical clustering, facilitate # generation of a "cluster browser", annotated by network statistics, # and family annotations. from JJcluster.cluster_sql import flatcluster, hcluster from DurandDB import familyq, blastq from JJnetstat.stathelper import nxstat from JJutil import pickler, rate import logging, os, numpy import networkx as NX class describe(object): def __init__(self, cluster_run_id, clustering_type, cacheq = False, pickledir = None, family_set_name = None, debug_level = None, # default is warning and above, # or whatever the root logger # was configured to have # elsewhere ): self.cacheq=cacheq if pickledir != None: self.pickledir = pickledir else: self.pickledir = os.path.expandvars('$HOME/tmp/pickles') self.log = logging.getLogger('JJcluster.describe') if debug_level is not None: logging.getLogger('').setLevel(debug_level) if clustering_type == 'hierarchical': self.CR = hcluster(cluster_run_id = cluster_run_id, cacheq = self.cacheq) else: self.CR = flatcluster(cluster_run_id = cluster_run_id, cacheq = self.cacheq) # reading queries from the disk is often slower than the database if family_set_name is not None: self.fq = familyq.familyq(family_set_name = family_set_name, cacheq=False) else: # FIXME: this will break the html building, where they depend on family functions self.fq = blastq.blastq(cacheq=False) def cluster_conductance(self, cluster_id): """Calculate the conductance of a cluster.""" cluster_seq_set = self.CR.fetch_cluster(cluster_id) full_seq_set = self.fq.fetch_seq_set(br_id=self.CR.br_id, nc_id=self.CR.nc_id, set_id=self.CR.set_id_filter) self.log.debug("Calculating conductance: %d (size: %d)" % ( cluster_id,len(cluster_seq_set))) boundary_sum = 0.0 a_s = 0.0 a_not_s = 0.0 # count edge boundary for query_seq in cluster_seq_set: d = self.fq.fetch_hits_dictarray( stype=self.CR.stype, br_id=self.CR.br_id, nc_id=self.CR.nc_id, symmetric=self.CR.symmetric, query_seq_id = query_seq, seq_id_0_set_id = self.CR.set_id_filter, # potentially faster if specified set_id_filter = self.CR.set_id_filter, self_hits=False) (hit_list, score_list) = d[query_seq] a_s += numpy.sum(score_list) for i,seq_id_1 in enumerate(hit_list): if seq_id_1 not in cluster_seq_set: boundary_sum += score_list[i] # FIXME: what do we do about having a symmetric matrix? for query_seq in (full_seq_set - cluster_seq_set): d = self.fq.fetch_hits_dictarray( stype=self.CR.stype, br_id=self.CR.br_id, nc_id=self.CR.nc_id, symmetric=self.CR.symmetric, query_seq_id = query_seq, seq_id_0_set_id = self.CR.set_id_filter, # potentially faster if specified set_id_filter = self.CR.set_id_filter, self_hits=False) # a sequence may have no hits if not query_seq in d: continue (hit_list, score_list) = d[query_seq] a_not_s += numpy.sum(score_list) if a_not_s > a_s: break conductance = (boundary_sum / a_s) if a_s > 0 else -1 self.log.debug("Conductance: %g, boundary: %g, a_s: %g, minimum a_not_s: %g" % ( conductance, boundary_sum, a_s, a_not_s)) return conductance def fetch_cluster_hits(self, cluster_id, stype=None): """Fetch the sequence (sub)network for all nodes in cluster_id.""" seq_set = self.CR.fetch_cluster(cluster_id) # Default to the stype of the clustering, but it may be useful # to be able to extract bit-scores after clustering with NC # scores if stype is None: stype = self.CR.stype self.log.debug("Fetching cluster: %d (stype: %s)" % (cluster_id, stype)) hit_dict = {} qrate = rate.rate(totcount=len(seq_set)) for query_seq in seq_set: qrate.increment() if qrate.count % 1000 == 0: self.log.info("%d %s", (cluster_id, qrate)) d = self.fq.fetch_hits_dictarray( stype=stype, br_id=self.CR.br_id, nc_id=self.CR.nc_id, symmetric=self.CR.symmetric, query_seq_id = query_seq, seq_id_0_set_id = self.CR.set_id_filter, # potentially faster if specified seq_set = seq_set, self_hits=False) assert len(d) <= 1, "d has more than one key for query_seq %s" % query_seq if query_seq in d: hit_dict[query_seq] = d[query_seq] return (hit_dict, seq_set) def network_stats(self, cluster_id, hit_dict=None): if self.cacheq: retval = pickler.cachefn(pickledir = self.pickledir, args = "%s_%s" % (self.CR.cr_id, cluster_id)) if retval is not None: return retval if hit_dict is None: hit_dict = self.fetch_cluster_hits( cluster_id) self.log.debug("Calculating network stats: %d (size %d)", (cluster_id,len(hit_dict))) # build a networkx graph of all hits in the cluster G = NX.Graph() # add all edges G.add_edges_from( ( (node, nbr) for (node,(hit_list, score_list)) in hit_dict.iteritems() for nbr in hit_list )) net = nxstat(G=G, cacheq=True, unique_parameters="%s_%s" % (self.CR.cr_id, cluster_id)) net_stats = net.calc_statistics(omit_spl=True) if self.cacheq: retval = pickler.cachefn(pickledir = self.pickledir, args = "%s_%s" % (self.CR.cr_id, cluster_id), retval = net_stats) return net_stats def cluster_stats(self, cluster_id, hit_dict=None, seq_set=None, stype=None): if self.cacheq: retval = pickler.cachefn(pickledir = self.pickledir, args = "%s_%s_%s_%s" % (self.CR.cr_id, self.CR.set_id_filter, cluster_id, stype)) if retval is not None: return retval if hit_dict is None: hit_dict, seq_set = self.fetch_cluster_hits( cluster_id, stype=stype) # FIXME: Not currently symmetric. Repetitive, and likely # quite slow for very many clusters. Some values could be # calculated by traversing the tree. num_scores = 0 all_scores = numpy.empty([10], dtype=numpy.float64) # build an array of all scores within this set for (hit_list, score_list) in hit_dict.values(): upper_ind = num_scores + len(score_list) if len(all_scores) < upper_ind: all_scores.resize( [int(upper_ind*1.5)]) all_scores[num_scores:upper_ind] = score_list num_scores = upper_ind all_scores.resize([num_scores]) #self.log.debug("cluster_stats: %d %s (len %d)" % (cluster_id, all_scores, num_scores)) stats = {} stats['num_nodes'] = len(seq_set) stats['num_edges'] = len(all_scores) stats['frac_edges'] = float(stats['num_edges']) / ( stats['num_nodes'] * (stats['num_nodes'] -1)) if num_scores > 0 else -1 stats['min'] = numpy.min( all_scores) if num_scores > 0 else -1 stats['max'] = numpy.max( all_scores) if num_scores > 0 else -1 stats['mean'] = numpy.mean( all_scores) if num_scores > 0 else -1 stats['stdev'] = numpy.std( all_scores) if num_scores > 0 else -1 stats['density'] = stats['mean'] * len(all_scores) / ( len(seq_set) * (len(seq_set) -1)) if num_scores > 0 else -1 if self.cacheq: retval = pickler.cachefn(pickledir = self.pickledir, args = "%s_%s_%s" % (self.CR.cr_id, self.CR.set_id_filter, cluster_id), retval = stats) return stats def html_run_header(self, descr="", orgarg=""): s_table = """
Cluster Run: | %(cr_id)d %(cr_comment)s %(cr_params)s |
Score Type: | %(stype)s |
Database Set: | %(set_id)d (%(num_seqs)d sequences) %(set_name)s %(set_descr)s |
Clustering Query Set: | %(q_set_id)d (%(q_num_seqs)d sequences) %(q_set_name)s %(q_set_descr)s |
Blast Run: | %(br_id)d Query set %(blast_query_set)d |
NC Run: | %(nc_id)s |
Background argument: | %(orgarg)s |
%(descr)s |