#!/usr/bin/env python
# Jacob Joseph
# 11 Dec 2009
# Build an HTML page to navigate hierarchical clusters
import sys, cProfile, time
from JJcluster.cluster_obj import cobj
from JJcluster.describe import describe
class browser(describe):
    """Build an HTML page for navigating a hierarchical clustering run.

    Specialises ``describe`` by fixing the clustering type to
    ``'hierarchical'`` and adding ``html_hierarchy``, which renders the
    cluster tree as text.
    """

    def __init__(self, cluster_run_id,
                 cacheq=False,
                 family_set_name=None):
        """Initialise the underlying ``describe`` object.

        All three arguments are forwarded unchanged to
        ``describe.__init__``; ``clustering_type`` is always
        ``'hierarchical'``.
        """
        describe.__init__(self, cluster_run_id=cluster_run_id,
                          clustering_type='hierarchical',
                          cacheq=cacheq,
                          family_set_name=family_set_name)

    def html_hierarchy(self, orgarg="", family_abbrev=None,
                       large_cluster_size=1000):
        """Return the page text describing the cluster hierarchy.

        Parameters:
          orgarg -- string passed to ``self.html_run_header`` and supplied
              to the cluster templates under the key ``'orgarg'``.
          family_abbrev -- when not None, the tree is limited to the
              left/right extents of the common parent of that family's
              sequences; otherwise the whole structure is fetched.
          large_cluster_size -- clusters with more members than this are
              rendered with the short template and no edge statistics.
              Defaults to 1000, the previously hard-coded threshold.

        Returns the concatenation of the header, one entry per visited
        node, and a trailing newline.  Two diagnostic lines are printed to
        stdout when ``family_abbrev`` is given.
        """
        # constrain left and right to the extents of the family in the
        # tree.
        if family_abbrev is not None:
            family_members = self.fq.fetch_family_seqs(family_abbrev)
            parent_id = self.CR.get_common_parent(seq_ids=family_members)
            parent_row = self.CR.get_cluster_row(parent_id)
            # To show more context, climb further up with
            # parent_row = self.CR.get_cluster_row(parent_row['parent_id'])

            # Single-argument print(...) gives the same output as the old
            # print statements and also parses under Python 3.
            print("Common parent: %s %s" % (parent_id, parent_row))
            print("Nodes: %d" % (parent_row['rgt'] - parent_row['lft'],))

            root = self.CR.fetch_structure(left_lim=parent_row['lft'],
                                           right_lim=parent_row['rgt'])
        else:
            root = self.CR.fetch_structure()

        # NOTE(review): the templates below reference only a subset of the
        # keys supplied to them ('space', 'parent_id', 'left_id',
        # 'right_id', 'cr_id' and 'orgarg' are unused; %-formatting with a
        # mapping ignores extras) -- confirm the templates are complete.
        s_header = """
\n"""

        s_prefix_sequence = """%(level)0.3d
P
"""

        s_cluster = """%(level)0.3d
P
L R
Cluster %(cluster_id)d:
Cluster Similarity: %(clustsim)0.4f,
Size: %(num_nodes)d,
Density: %(density)0.4f,
J: %(J)0.4f,
Edges: %(num_edges)d,
Frac. Edges: %(frac_edges)0.4f,
Mean: %(mean)0.4f(%(stdev)0.4f)
\n"""

        s_cluster_large = """%(level)0.3d
P L R
Cluster %(cluster_id)d: Cluster Similarity: %(clustsim)0.4f, Size: %(num_nodes)d, J: %(J)0.4f
\n"""
        # work around emacs lingering highlighting of quote in triple quotes "

        # Collect fragments and join once at the end instead of growing a
        # string with += for every node of the tree.
        pieces = [s_header, self.html_run_header(orgarg=orgarg)]

        # stack of (level, cluster, parent_size, parent_id) tuples, where
        # level is the amount of indentation needed
        stack = [(0, root, None, None)]

        while stack:
            (level, clust, parent_size, parent_id) = stack.pop()

            # We're at a leaf cluster
            if 1 == clust.right() - clust.left():
                seq_id = clust.items()[0]
                pieces.append(s_prefix_sequence % {'level': level,
                                                   'space': level*4 + 35,
                                                   'parent_id': parent_id})
                pieces.append(self.html_sequence(seq_id))
                continue

            cluster_id = clust.cluster_id()
            cluster_size = len(self.CR.fetch_cluster(cluster_id))

            # Add children to the stack, ordered so that smaller clusters
            # (esp singletons) come first (i.e. last on the stack).
            # The sort stays in place on the list returned by items(), as
            # before; whether that list is shared with the cluster object
            # is not visible here.
            children = clust.items()
            children.sort(key=lambda a: a.right()-a.left(), reverse=True)
            for child in children:
                if isinstance(child, cobj):
                    stack.append((level + 1, child, cluster_size, cluster_id))

            # The root has no parent; using its own size makes J zero.
            if parent_size is None:
                parent_size = cluster_size

            # edge statistics take a while to calculate, so
            # calculate them only for smaller clusters
            if cluster_size > large_cluster_size:
                template = s_cluster_large
                fields = {}
            else:
                edge_stats = self.cluster_stats(cluster_id)
                template = s_cluster
                fields = {'num_edges': edge_stats['num_edges'] / 2,
                          'frac_edges': edge_stats['frac_edges'],
                          'density': edge_stats['density'],
                          'mean': edge_stats['mean'],
                          'stdev': edge_stats['stdev']}

            # Fields shared by both cluster templates.  left/right ids are
            # read from a fresh items() call, exactly as before.
            fields.update({
                'level': level,
                'space': level*4,
                'cluster_id': cluster_id,
                'clustsim': 1-clust.distance(),
                'num_nodes': cluster_size,
                'J': float(parent_size - cluster_size) / cluster_size,
                'left_id': clust.items()[0].cluster_id(),
                'right_id': clust.items()[1].cluster_id(),
                'parent_id': parent_id,
                'cr_id': self.CR.cr_id,
                'orgarg': orgarg,
            })
            pieces.append(template % fields)

        pieces.append("\n")
        return "".join(pieces)
class runparam:
    """Plain record holding four identifiers for a clustering run.

    The attributes br_id, nc_id, stype and set_id are stored exactly as
    given; each defaults to None.  The commented-out ``cr_id_map`` in the
    script section shows the intended positional use, e.g.
    ``runparam(100, 746, 'nc_score', 105)``.
    """

    def __init__(self, br_id=None, nc_id=None,
                 stype=None, set_id=None):
        # No validation or conversion: the values are kept as passed in.
        self.br_id, self.nc_id = br_id, nc_id
        self.stype, self.set_id = stype, set_id
if __name__ == "__main__":
    # Command line: cr_id set_id family_set_name [family_abbrev]
    # Fail with a usage message rather than an IndexError traceback when
    # required arguments are missing.
    if len(sys.argv) < 4:
        sys.exit("usage: %s cr_id set_id family_set_name [family_abbrev]"
                 % sys.argv[0])

    cr_id = int(sys.argv[1])
    set_id = int(sys.argv[2])
    family_set_name = sys.argv[3]
    # Optional; used to select a subtree.  Accepted whenever a fourth
    # argument is present, even if further arguments follow it.
    family_abbrev = sys.argv[4] if len(sys.argv) > 4 else None

    date = time.strftime('%Y%m%d')

    # FIXME: workaround for not storing br_id, nc_id, set_id in a
    # queryable field
    # cr_id_map = {
    # 70: runparam(100, 746, 'nc_score', 105), # full set
    # 71: runparam(97, 750, 'nc_score', 105), # cluster
    # 72: runparam(97, 746, 'nc_score', 107), # full set blast, new jan10 set
    # 73: runparam(104, 777, 'nc_score', None), # 12 species, not symmetric
    # 74: runparam(104, 777, 'nc_score', 109), # human and mouse only, not symmetric
    # 75: runparam(104, 779, 'nc_score', ), # 12 species, symmetric
    # 76: runparam(105, 780, 'nc_score', ), # 48 species, not symmetric
    # 77: runparam(104, 779, 'nc_score', 109), # Human and mouse only. Symmetric
    # 78: runparam(104, 779, 'nc_score', 109), # Human and mouse only. Symmetric. Single linkage
    # 79: runparam(104, 779, 'nc_score', 111), # Yeast only. Symmetric
    # 80: runparam(104, 779, 'nc_score', 111), # Yeast only. Symmetric. Single linkage
    # 81: runparam(104, 779, 'nc_score', 112), # Human only. Symmetric.
    # 82: runparam(105, 781, 'nc_score', 112), # Human only. Symmetric. Not compositional
    # }

    b = browser(cluster_run_id = cr_id,
                family_set_name=family_set_name,
                cacheq=True)

    s = b.html_hierarchy(orgarg="&o=h",
                         family_abbrev=family_abbrev)

    # The file name is unchanged; when no family_abbrev is given it ends in
    # "_None.html".  The "figures" directory must already exist.
    out_path = "figures/%s_browser_cr_id_%d_set_id_%d_%s_%s.html" % (
        date, cr_id, set_id, family_set_name, family_abbrev)

    # try/finally guarantees the handle is closed even if the write fails.
    fd = open(out_path, 'w')
    try:
        fd.write(s)
    finally:
        fd.close()