stable version

61dbb71b · Castillo · d414ea78 · 61dbb71b · 61dbb71b · 61dbb71b
Commit 61dbb71b authored Nov 18, 2016 by Castillo
Showing with 294 additions and 124 deletions

PhyloSpark.py PhyloSpark.py +134 -85

output_2json.py output_2json.py +17 -7

phylum_srv.py phylum_srv.py +36 -12

phylomain.js static/phylomain.js +64 -12

test.py test.py +43 -8

No files found.
--- a/PhyloSpark.py
+++ b/PhyloSpark.py
@@ -10,6 +10,9 @@ import os
 import datetime
 import pprint
 import itertools
+import time
+from itertools import combinations
+import networkx as nx

 def lineal_comparisons( years ):
 	D = {}
@@ -113,7 +116,11 @@ class Period:

 		# Psub = P.join( WL ).map(lambda x: (x[0],x[1][0]))
 		T = P.join( WL ).map(lambda x: x[1][0] )
+
+		t_i = time.time() ##
 		self.P_c = T.count()
+		t_f = time.time() ##
+		T_t = "{0:.3f}".format((t_f - t_i)) +"[s]" ##

 		# # saving term frequencies
 		# self.TF = T.flatMap(lambda xs: [x for x in xs]).map(lambda x: (x, 1))
@@ -127,6 +134,8 @@ class Period:
 		# - - - - - - - - - - - - - - - - - - - - - # 
 		# self.FI = model.freqItemsets().filter(lambda x: len(x.items)>=minfsetsize and x.freq>=2)
 		# .sortBy(lambda x: x.freq , ascending=False).zipWithIndex().map( lambda x: ( x[1] , x[0][0] , x[0][1]  )  ).persist()
+
+		t_i = time.time() ##
 		self.FI = model.freqItemsets().filter(lambda x: len(x.items)>=minfsetsize and x.freq>=2)
 		self.FI = self.FI.sortBy(lambda x: x.freq , ascending=False).zipWithIndex().map( lambda x : (x[1],x[0]) ).persist()
 		#.filter(lambda x: x[1]<=100).map( lambda x: ( x[1] , x[0][0] , x[0][1]  )  ).persist()
@@ -140,6 +149,10 @@ class Period:
 		# # # # = = [ / Extracting Frequent Itemsets ]  = = = # # # 

 		self.FI_c = self.FI.count()
+
+		t_f = time.time() ##
+		FI_t = "{0:.3f}".format((t_f - t_i)) +"[s]" ##
+
 		# # print("")
 		# # print("")
 		# # print("----FI----",year)
@@ -174,7 +187,8 @@ class Period:
 		# 	print( i )

 		# print( T.take(3) )
-		print( "\t\t\t|FIs|"," -> ",self.FI_c )
+		print( "\t\t\t|T|"," -> ",self.P_c , "\t",T_t )
+		print( "\t\t\t|FI|"," -> ",self.FI_c , "\t",FI_t )
 		print("")
 		return self.P_c , self.FI_c

@@ -195,6 +209,7 @@ class Phylo:
 			"from_": { "type": int , "value": -1 },
 			"to_": { "type": int , "value": -1 },
 			"minfpgsupp": { "type": float , "value": 0.0001 },
+			"minfsetsupp": { "type": int , "value": 2 },
 			"minfsetsize": { "type": int , "value": 4 },
 			"minsetdistance": { "type": int , "value": 0 },
 			"mram": { "type": int , "value": 40 },
@@ -235,7 +250,11 @@ class Phylo:
 		N = self.yearsD
 		K = self.minK

+		if len(WL)>0:
 			WL = self.sc.parallelize( WL ).map( lambda x: (int(x) , 1) )
+		else:
+			WL = self.WL
+
 		Distribution = {}
 		for y in t :
 			period_ = Period( some_sc=self.sc , period=y , numpart=self.partitions )
@@ -276,8 +295,10 @@ class Phylo:
 					}


-	def get_opossites( self , found_distances ):
+	def get_opossites( self , found_distances=[] , filter_s = {} ):

+		print( "AAAAAAH" )
+		print( filter_s )
 		data = {}

 		Nodes = {}
@@ -304,8 +325,14 @@ class Phylo:

 		for y in period_nodes:
 			clusters = self.sc.parallelize( period_nodes[y] )
-			R = self.yearsD[y].FI.join( clusters ).map(lambda x : [ x[0] , list(x[1][0].items) , x[1][0].freq ] ).collect()
-			for i in R:
+			R = self.yearsD[y].FI.join( clusters ).map(lambda x : [x[0] , list(x[1][0].items) , x[1][0].freq ] )
+			if "minfsetsupp" in filter_s:
+				R = R.filter( lambda x : x[2]>=filter_s["minfsetsupp"] )
+			if "minfsetsize" in filter_s:
+				R = R.filter( lambda x : len(x[1])>=filter_s["minfsetsize"] )
+			RR = R.collect()
+			# pprint.pprint( RR )
+			for i in RR:
 				cID = str(y)+"c"+str(i[0])
 				if cID not in data:
 					data[ cID ] = {
@@ -318,7 +345,10 @@ class Phylo:



-	def filter_jaccard(self , jacc_min ):
+	def filter_jaccard(self , filter_s = {} ):
+
+		jacc_min = filter_s["jacc_min"]
+		f__ = filter_s

 		# print("\tin filter_jaccard!!")
 		rname = datetime.datetime.now().isoformat()+""
@@ -338,17 +368,17 @@ class Phylo:
 		print( "\t",jacc_min,"-> |JACCARD|:",len(found_distances) )

 		timerange = [ 1982 , 2014 ]
-		phylojson = lll.export_phylo( liens=found_distances , T=timerange , jacc_min=jacc_min )
+		phylojson, Parents = lll.export_phylo( liens=found_distances , T=timerange , jacc_min=jacc_min )

-		nodes_md = self.get_opossites( found_distances )
+		nodes_md = self.get_opossites( found_distances , filter_s )

 		nB2A = {}
 		nA2B = {}
 		NodesD_i2s = {}
 		NodesD_s2i = {}
 		NodesC = 0
-		for IDA_o in nodes_md:

+		for IDA_o in sorted(nodes_md.keys()):
 			IDA_s = "A_"+str(IDA_o)
 			if IDA_s not in NodesD_s2i:
 				NodesC += 1
@@ -361,6 +391,7 @@ class Phylo:


 			items_ = {}
+			# print( IDA_o )
 			for ii in nodes_md[IDA_o]["items"]:
 				IDB_s = "B_"+str( ii )
 				if IDB_s not in NodesD_s2i:
@@ -373,7 +404,8 @@ class Phylo:
 					NodesD_s2i[ IDB_s ] = NodesC
 				IDB_i =  NodesD_s2i[ IDB_s ]
 				items_[ IDB_i ] = True
-			
+			# print("\t",sorted(items_))
+			# print("")
 			nA2B[ NodesD_s2i[ IDA_s ] ] = items_

 			for i in items_:
@@ -381,6 +413,54 @@ class Phylo:
 					nB2A[i] =  {}
 				nB2A[i][ NodesD_s2i[ IDA_s ] ] = True

+
+
+		# ETAGES = {}
+		# print("")
+		# print("PARENTS!!:")
+		# for p in sorted( Parents.keys() ):
+		# 	TO_MERGE = nx.Graph()
+		# 	p_items = sorted(nA2B[ NodesD_s2i[ "A_"+str(p) ] ])
+		# 	print(p ,":", p_items)
+		# 	p_children = sorted(Parents[p])
+		# 	p_children_D = {}
+		# 	for j in p_children:
+		# 		child_items = nA2B[ NodesD_s2i[ "A_"+str(j) ] ]
+		# 		print("\t",j ,":", sorted(child_items) )
+		# 		if j in Parents:
+		# 			j_children = Parents[j]
+
+		# 			if j not in p_children_D:
+		# 				p_children_D[ j ] = set( j_children )
+
+		# 	if len(p_children_D)>0:
+		# 		for i in p_children_D:
+		# 			TO_MERGE.add_node( i )
+		# 		p_j_children_pairs = combinations(p_children_D.keys(), 2)
+		# 		for cc in p_j_children_pairs:
+		# 			CID1 = cc[0]
+		# 			CID2 = cc[1]
+		# 			if p_children_D[ CID1 ] == p_children_D[ CID2 ]:
+		# 				print( "\t\t\tsame content:",CID1,CID2 )
+		# 				TO_MERGE.add_edge( CID1 , CID2 ) 
+		# 		print("")
+		# 		h = nx.connected_components(TO_MERGE)  
+		# 		for ss in h:
+		# 			if len(ss)>1:
+		# 				print("\t\t\t",ss)
+		# 				# print(ss)
+		# 				merge_this = {}
+		# 				for ss_i in ss:
+		# 					print("\t\t\t\t",ss_i)
+		# 					# merge_this.union( p_children_D[ ss_i ] )
+		# 					elems = p_children_D[ ss_i ]
+		# 					for ll in elems:
+		# 						merge_this[ ll ] = True
+		# 				merge_this = set(merge_this.keys())
+		# 				print("\t\t\t",merge_this )
+		# 			print("")
+		# 		print("")
+
 		from n_partite_graph import nPartiteGraph
 		bg = nPartiteGraph()
 		graph_b = bg.BiGraph_2( nA2B , nB2A )
@@ -443,6 +523,8 @@ class Phylo:
 			s = "A_"+str(s_)
 			t = "A_"+str(t_)
 			# print( NodesD_s2i[ s ] ,"->", NodesD_s2i[ t ] )
+			if "hidden" not in i:
+				if s in NodesD_s2i and t in NodesD_s2i:
 					ID_s = NodesD_s2i[ s ]
 					ID_t = NodesD_s2i[ t ]

@@ -470,12 +552,17 @@ class Phylo:
 		for cID in phylojson["nodes"]:
 			ID_s = "A_"+str(cID)
 			try:
+				if ID_s in NodesD_s2i:
 					ID_i = NodesD_s2i[ ID_s ]
 					# print( cID ,":",ID_i )

 					node_ = phylojson["nodes"][cID]
 					node_["id"] = ID_i 
 					node_["label"] = cID
+					if cID in nodes_md :
+						if "supp" in nodes_md[cID]:
+							node_["supp"] = nodes_md[cID]["supp"]
+
 					# node_["shape"] = "square" 
 					# node_["type"] = "Cluster" 
 					# 	"x":float(coord[0]) , 
@@ -498,40 +585,6 @@ class Phylo:
 				xxx = 10
 				# a year-node

-
-		# # return { "nodes": [] , "links": [] }
-
-
-		# # 		# print("")
-		# # print(" - - - - - -")
-		# # print("")
-
-		# # for i in graphArray["links"]:
-		# # 	print("_ ",i)
-		# # 	print("_ ","")
-
-
-		# # for i in graphArray["nodes"]:
-		# # 	print( i)
-		# # 	print( "")
-		# # print( " - - - - - - - - -")
-		# # for i in graphArray["links"]:
-		# # 	print( i)
-		# # 	print( "")
-
-		# Nodes_DD = {}
-		# for i in Nodes:
-		# 	print("_ ",i["id"])
-		# 	Nodes_DD[ i["id"] ] = i
-		# # 	print("_ ","")
-		# print("_ "," - - - - - - - - -")
-		# for i in Links:
-		# 	print("_ ",i["s"] ,"->", i["t"] )
-		# 	print( Nodes_DD[ i["s"] ] )
-		# 	print( Nodes_DD[ i["t"] ] )
-		# 	print("")
-		# print("_ "," - - - - - - - - -")
-
 		# # print( "|V_phy|:", len(phylojson["nodes"]))
 		# # print( "|E_phy|:", len(phylojson["links"]))
 		# # print( "|V|:", len(graphArray["nodes"]))
@@ -601,17 +654,13 @@ class Phylo:

 			return { "diff_time": { "Distribution": Distribution , "years":nyears , "pairs":pairs , "pairsD":pairsD } }

-
-
-
-
 		# print("")
 		# print( "old jacc:", self.minjacc )
 		# print( "new jacc:", p_["minjaccard"]  )
 		# phylojson = self.filter_jaccard ( p_["minjaccard"] )
 		# print("")

-		return None
+		return {}




--- a/output_2json.py
+++ b/output_2json.py
@@ -168,9 +168,9 @@ class PhyloMaker:
 		for y in years:
 			AG.add_node(str(y), label=y , fake=True ,shape="plaintext")

-		for i in range(len(years)):
+		for i in sorted(years):
 			try:
-				AG.add_edge(str(years[i]),str(years[i+1]),fake=True)
+				AG.add_edge(str(i),str(i+1),fake=True,weight=1)
 			except:
 				pass
 		 # - - - - - [ / Adding yearly-graph ] - - - - - #
@@ -226,8 +226,18 @@ class PhyloMaker:


 		# redundant_ = nx.DiGraph()
-		# for n in AG.nodes_iter():
-		# 	node = AG.node[n]
+		Parents = {  }
+		for n in AG.nodes_iter():
+			node = AG.node[n]
+			if "fake" not in node:
+				succesors = AG.neighbors( n )
+				if len( succesors )>0:
+					Parents[n] = sorted( succesors )
+					# print( n )
+					# for j in succesors:
+					# 	print( "\t",j )
+					# print("- - - - ")
+					# print("")
 		# 	if "fake" not in node:
 		# 		parents = AG.predecessors( n )
 		# 		if len(parents)>=2:
@@ -350,9 +360,9 @@ class PhyloMaker:
 		for e in B.edges_iter():
 			s = e[0]
 			t = e[1]
-			# if "fake" not in AG[s][t]:
-			# print(e)
 			infodict = {"s":s , "t":t , "w":AG[s][t]["weight"] , "type":"line" }
+			if "fake" in AG[s][t]:
+				infodict["hidden"] = True
 			EdgesDict.append(infodict)

 		Graph = {  
@@ -366,4 +376,4 @@ class PhyloMaker:
 		end = time.time()
 		print(float("{0:.2f}".format(end - start)),"[s] : dot layout FIN")

-		return Graph
+		return Graph, Parents
--- a/phylum_srv.py
+++ b/phylum_srv.py
@@ -111,7 +111,7 @@ def close_contexts():
 def test_post():
 	pprint.pprint( request )
 	query = "void"
-	GG = False
+	GG = { "nodes": [] , "links": [] }
 	stats = False
 	records = { "Count": 0 }
 	if request.method == "POST":
@@ -218,9 +218,15 @@ def test_post():
 			
 			# # pairs of years to be multiplied
 			I[ sID ].temp_matching(  thepairs = pairs )
-			GG = I[ sID ].filter_jaccard ( jacc_min=minjaccard )
+			filters_ = {
+				"jacc_min": I[ sID ].p["minsetdistance"]["value"],
+				"minfsetsize": I[ sID ].p["minfsetsize"]["value"],
+				"minfsetsupp": I[ sID ].p["minfsetsupp"]["value"],
+			}
+			GG = I[ sID ].filter_jaccard ( filter_s=filters_ )


+			if len( GG["links"] )>0:
 				Ya = p_["from_"]
 				Yb = p_["to_"]
 				GG_v = str( len( GG["nodes"] ) )
@@ -236,6 +242,7 @@ def test_post():
 			pprint.pprint( I[ sID ].p )
 			print( "" )
 			params_ = {}
+			# # Updating   I[ sID ].p   parameters with new ones # #
 			for k in p_:
 				if "scontext"!=k:
 					try:
@@ -258,8 +265,25 @@ def test_post():
 				I[ sID ].pairsD = diff_ress["diff_time"]["pairsD"]

 				I[ sID ].temp_matching(  thepairs = diff_ress["diff_time"]["pairs"] )
-				GG = I[ sID ].filter_jaccard ( jacc_min=I[ sID ].p["minsetdistance"]["value"] )
+				filters_ = {
+					"jacc_min": I[ sID ].p["minsetdistance"]["value"],
+					"minfsetsize": I[ sID ].p["minfsetsize"]["value"],
+					"minfsetsupp": I[ sID ].p["minfsetsupp"]["value"],
+				}
+				GG = I[ sID ].filter_jaccard ( filter_s=filters_ )
+			else:
+				pairs = I[ sID ].pairs
+				I[ sID ].temp_matching(  thepairs = pairs )
+				filters_ = {
+					"jacc_min": I[ sID ].p["minsetdistance"]["value"],
+					"minfsetsize": I[ sID ].p["minfsetsize"]["value"],
+					"minfsetsupp": I[ sID ].p["minfsetsupp"]["value"],
+				}
+				GG = I[ sID ].filter_jaccard ( filter_s=filters_ )
+
+

+			if len( GG["links"] )>0:
 				Ya = str(I[ sID ].p["from_"]["value"])
 				Yb = str(I[ sID ].p["to_"]["value"])
 				GG_v = str( len( GG["nodes"] ) )

--- a/static/phylomain.js
+++ b/static/phylomain.js
@@ -126,13 +126,21 @@ function dict_diff(obj1, obj2) {
 }


+
+
+
+
 var K_i2s = {}
+var K_oi2i = {}
 // var K_s2i = {}
 var loader_ = '<img width=20 src="/static/Phylo/libs/img2/loading-bar.gif"></img>'
 var G = {
    "params_t0" : {},
 }

+var Clusters_2DEL = {}
+var Terms_2DEL = {}
+
 var POST_ = false

 // "scontext"
@@ -154,6 +162,32 @@ function getParams(form , children_ ) {
    return p_;
 }

+$("#remove_terms").click( function(){ // click handler bound to the #remove_terms element
+  // body is empty: clicking currently does nothing
+  // TODO(review): no term-removal logic is visible here (Terms_2DEL is declared above but not used in this handler)
+})
+
+$("#remove_clusters").click( function(){ // click handler: removes every selected node whose type is "Cluster"
+  console.log("removing clusteeeers")
+  if (! $.isEmptyObject( selections )) { // skip everything, including the redraw, when nothing is selected
+    for(var cID in selections) { // selections is defined elsewhere; presumably keyed by node id -- verify
+      if( Nodes[cID].type=="Cluster" ) { // NOTE(review): throws if Nodes[cID] is undefined (e.g. a stale selection entry) -- confirm
+	      partialGraph.dropNode(cID) // remove the node from partialGraph
+	      try { // best-effort cleanup of the lookup tables; any error raised below is swallowed
+	        delete Nodes[cID]
+	        delete dicts.nodes[cID]
+	        delete dicts.D2N[cID]
+	        delete Relations["1|1"][cID] // meaning of the "1|1" key is not visible here -- TODO confirm
+	      } catch(err) { // err is ignored on purpose or by omission; nothing is logged
+	        var xxxxx = 111 // no-op placeholder so the catch body is not empty
+	      }
+  	  }
+    } // NOTE(review): cID is not removed from selections in this handler -- check whether dropNode does it
+    partialGraph.refresh() // refresh and redraw once, after all removals
+    partialGraph.draw()
+  }
+})
+
 function send_params( D ) {

    var query = $("#pubmedquery").val().slice()
@@ -209,7 +243,7 @@ function send_params( D ) {

 $("#pubmed_fetch").bind('click', function() {
      
-    console.log( "hola mundo" )
+    console.log( "pubmed_fetch" )

    var URL = "<URL>"

@@ -236,8 +270,20 @@ $("#pubmed_fetch").bind('click', function() {
        var params_t1 = getParams("phyloform" , "input")
        var params_diff = dict_diff( params_t1 , G["params_t0"] )

+        console.log("")
+        console.log("")
+        console.log("")
+        console.log("DIFF TIMES!!!")
+        console.log( params_t1 )
+        console.log( G["params_t0"] )
+        console.log(" - - - - - - - ")
+        console.log( params_diff )
+
+        console.log("")
+        console.log("")
+        console.log("")
        // spark context has changed ->  change everything
-        if("scontext" in params_diff) {
+        if("query" in params_diff) {
          return send_params( params_t1 )
        }

@@ -331,32 +377,32 @@ function get_ngrams( query ) {
        data: DD ,
        success : function(data) { 
            console.log( "get_ngrams!!" )
-            console.log( "data:" )
-            console.log( data )
+            // console.log( "data:" )
+            // console.log( data )
            for(var i in data){
                K_i2s[ i ] = data[i]
            }
-            console.log( "K_i2s:" )
-            console.log( K_i2s )
+            // console.log( "K_i2s:" )
+            // console.log( K_i2s )

-            console.log("iter mesh_terms")
+            // console.log("iter mesh_terms")

            for(var i in dicts.nodes) {
              if( dicts.nodes[i].type=="mesh_term" ) {
+
                // console.log ( dicts.nodes[i]  )
                // console.log ( K_i2s[dicts.nodes[i].label]  )
                // console.log ( K_i2s[Number(dicts.nodes[i].label)]  )
                // console.log("")
-                console.log( dicts.nodes[i].label )
-                console.log( K_i2s[dicts.nodes[i].label] )
+                // console.log( dicts.nodes[i].label )
+                // console.log( K_i2s[dicts.nodes[i].label] )
                var ID = dicts.nodes[i].id
                var newlabel = K_i2s[dicts.nodes[i].label]
                if( typeof( newlabel )!="undefined" ) {
+                  K_oi2i [ newlabel ] = ID 
                  dicts.nodes[i].label = newlabel
                  Nodes[ ID ].label = newlabel
-                  console.log( dicts.nodes[i] )
-                  console.log( Nodes[ ID ] )
-                  console.log("")
+
                }
              }
            }
@@ -364,6 +410,12 @@ function get_ngrams( query ) {
            partialGraph.draw()


+            labels = []
+            for(var kk in K_i2s ){
+                updateSearchLabels( kk , K_i2s[kk] , "mesh_term");
+            }
+
+

        },
        error: function(jqxhr,textStatus,errorThrown) {

--- a/test.py
+++ b/test.py
@@ -136,25 +136,60 @@ def test_workflow():
 	import time
 	print("hello")

+	minsupp = 0.0001
+	numpart = 100
+
+	minfsetsize = 4
+
+
+
+	import findspark
+	findspark.init()
+	from pyspark.mllib.fpm import FPGrowth
+	from pyspark import SparkContext
+	from pyspark import SparkConf
+	cfg = SparkConf().set('spark.driver.memory', "40g").set('spark.driver.cores', 20 ).setAppName("simple_app")
+	ncores = 20
+	sc__ = SparkContext(conf=cfg)
 	from PhyloSpark import Phylo
-	periods_ = [ 1983 , 1984 ]
-	the_ = Phylo( t=periods_ , memm="20g" , ncores="24" )
+
+	periods_ = range(2003,2005+1)
+	the_ = Phylo( t=periods_ , minJ=0.0 , spark_context=sc__ , ncores=ncores )

 	# WL = getWL( the_.sc , "/datasets/PubMed2014/chikungunya.txt" )
 	WL_path = "/datasets/PubMed2014/chikungunya.txt"
-	WL = the_.sc.textFile( WL_path ).map( lambda line: (int(line.strip()) , 1) )
+	# WL_path = "/datasets/PubMed2014/gut_AND_brain.txt"
+	WL = sc__.textFile( WL_path ).map( lambda line: (int(line.strip()) , 1) )


 	# WL = getWL( the_.sc , "/datasets/PubMed2014/cell-aging.txt" )
 	# WL = getWL( the_.sc , "/datasets/PubMed2014/rheumatoid-arthritis.txt" )


-	for i in range(1983,2015):
-		# start = time.time()
+	for i in range(2003,2005+1):
+		start = time.time()
 		period = str(i)
-		print(period)
-		Psub = interDataSet( the_.sc , period , WL )
-		# print("\t",len(ress.collect()))
+		# print(period)
+		T = interDataSet( sc__ , period , WL ).map(lambda x: x[1] )
+		print("\t",period,"->",len(T.collect()))
+		# print( "\t", T.take(1))
+		model = FPGrowth.train(T, minSupport=minsupp, numPartitions=numpart)
+		FI_all_c = model.freqItemsets().count()
+		print("\t\t |FI|", FI_all_c)
+
+
+		t_i = time.time() ##
+
+		FI = model.freqItemsets().filter(lambda x: len(x.items)>=minfsetsize and x.freq>=2)
+		FI = FI.sortBy(lambda x: x.freq , ascending=False).zipWithIndex().map( lambda x : (x[1],x[0]) ).persist()
+		
+		FI_c = FI.count()
+		t_f = time.time() ##
+		FI_t = "{0:.3f}".format((t_f - t_i)) +"[s]" ##
+		print("\t\t |FI_| ", FI_c , "\t",FI_t )
+		print("")
+
+
 		# end = time.time()
 		# print("\t\t",end - start)
 		print("")