Commit 61dbb71b authored by Castillo

stable version

parent d414ea78
......@@ -10,6 +10,9 @@ import os
import datetime
import pprint
import itertools
import time
from itertools import combinations
import networkx as nx
def lineal_comparisons( years ):
D = {}
......@@ -113,7 +116,11 @@ class Period:
# Psub = P.join( WL ).map(lambda x: (x[0],x[1][0]))
T = P.join( WL ).map(lambda x: x[1][0] )
t_i = time.time() ##
self.P_c = T.count()
t_f = time.time() ##
T_t = "{0:.3f}".format((t_f - t_i)) +"[s]" ##
# # saving term frequencies
# self.TF = T.flatMap(lambda xs: [x for x in xs]).map(lambda x: (x, 1))
......@@ -127,6 +134,8 @@ class Period:
# - - - - - - - - - - - - - - - - - - - - - #
# self.FI = model.freqItemsets().filter(lambda x: len(x.items)>=minfsetsize and x.freq>=2)
# .sortBy(lambda x: x.freq , ascending=False).zipWithIndex().map( lambda x: ( x[1] , x[0][0] , x[0][1] ) ).persist()
t_i = time.time() ##
self.FI = model.freqItemsets().filter(lambda x: len(x.items)>=minfsetsize and x.freq>=2)
self.FI = self.FI.sortBy(lambda x: x.freq , ascending=False).zipWithIndex().map( lambda x : (x[1],x[0]) ).persist()
#.filter(lambda x: x[1]<=100).map( lambda x: ( x[1] , x[0][0] , x[0][1] ) ).persist()
......@@ -140,6 +149,10 @@ class Period:
# # # # = = [ / Extracting Frequent Itemsets ] = = = # # #
self.FI_c = self.FI.count()
t_f = time.time() ##
FI_t = "{0:.3f}".format((t_f - t_i)) +"[s]" ##
# # print("")
# # print("")
# # print("----FI----",year)
......@@ -174,7 +187,8 @@ class Period:
# print( i )
# print( T.take(3) )
print( "\t\t\t|FIs|"," -> ",self.FI_c )
print( "\t\t\t|T|"," -> ",self.P_c , "\t",T_t )
print( "\t\t\t|FI|"," -> ",self.FI_c , "\t",FI_t )
print("")
return self.P_c , self.FI_c
......@@ -195,6 +209,7 @@ class Phylo:
"from_": { "type": int , "value": -1 },
"to_": { "type": int , "value": -1 },
"minfpgsupp": { "type": float , "value": 0.0001 },
"minfsetsupp": { "type": int , "value": 2 },
"minfsetsize": { "type": int , "value": 4 },
"minsetdistance": { "type": int , "value": 0 },
"mram": { "type": int , "value": 40 },
......@@ -235,7 +250,11 @@ class Phylo:
N = self.yearsD
K = self.minK
if len(WL)>0:
WL = self.sc.parallelize( WL ).map( lambda x: (int(x) , 1) )
else:
WL = self.WL
Distribution = {}
for y in t :
period_ = Period( some_sc=self.sc , period=y , numpart=self.partitions )
......@@ -276,8 +295,10 @@ class Phylo:
}
def get_opossites( self , found_distances ):
def get_opossites( self , found_distances=[] , filter_s = {} ):
print( "AAAAAAH" )
print( filter_s )
data = {}
Nodes = {}
......@@ -304,8 +325,14 @@ class Phylo:
for y in period_nodes:
clusters = self.sc.parallelize( period_nodes[y] )
R = self.yearsD[y].FI.join( clusters ).map(lambda x : [ x[0] , list(x[1][0].items) , x[1][0].freq ] ).collect()
for i in R:
R = self.yearsD[y].FI.join( clusters ).map(lambda x : [x[0] , list(x[1][0].items) , x[1][0].freq ] )
if "minfsetsupp" in filter_s:
R = R.filter( lambda x : x[2]>=filter_s["minfsetsupp"] )
if "minfsetsize" in filter_s:
R = R.filter( lambda x : len(x[1])>=filter_s["minfsetsize"] )
RR = R.collect()
# pprint.pprint( RR )
for i in RR:
cID = str(y)+"c"+str(i[0])
if cID not in data:
data[ cID ] = {
......@@ -318,7 +345,10 @@ class Phylo:
def filter_jaccard(self , jacc_min ):
def filter_jaccard(self , filter_s = {} ):
jacc_min = filter_s["jacc_min"]
f__ = filter_s
# print("\tin filter_jaccard!!")
rname = datetime.datetime.now().isoformat()+""
......@@ -338,17 +368,17 @@ class Phylo:
print( "\t",jacc_min,"-> |JACCARD|:",len(found_distances) )
timerange = [ 1982 , 2014 ]
phylojson = lll.export_phylo( liens=found_distances , T=timerange , jacc_min=jacc_min )
phylojson, Parents = lll.export_phylo( liens=found_distances , T=timerange , jacc_min=jacc_min )
nodes_md = self.get_opossites( found_distances )
nodes_md = self.get_opossites( found_distances , filter_s )
nB2A = {}
nA2B = {}
NodesD_i2s = {}
NodesD_s2i = {}
NodesC = 0
for IDA_o in nodes_md:
for IDA_o in sorted(nodes_md.keys()):
IDA_s = "A_"+str(IDA_o)
if IDA_s not in NodesD_s2i:
NodesC += 1
......@@ -361,6 +391,7 @@ class Phylo:
items_ = {}
# print( IDA_o )
for ii in nodes_md[IDA_o]["items"]:
IDB_s = "B_"+str( ii )
if IDB_s not in NodesD_s2i:
......@@ -373,7 +404,8 @@ class Phylo:
NodesD_s2i[ IDB_s ] = NodesC
IDB_i = NodesD_s2i[ IDB_s ]
items_[ IDB_i ] = True
# print("\t",sorted(items_))
# print("")
nA2B[ NodesD_s2i[ IDA_s ] ] = items_
for i in items_:
......@@ -381,6 +413,54 @@ class Phylo:
nB2A[i] = {}
nB2A[i][ NodesD_s2i[ IDA_s ] ] = True
# ETAGES = {}
# print("")
# print("PARENTS!!:")
# for p in sorted( Parents.keys() ):
# TO_MERGE = nx.Graph()
# p_items = sorted(nA2B[ NodesD_s2i[ "A_"+str(p) ] ])
# print(p ,":", p_items)
# p_children = sorted(Parents[p])
# p_children_D = {}
# for j in p_children:
# child_items = nA2B[ NodesD_s2i[ "A_"+str(j) ] ]
# print("\t",j ,":", sorted(child_items) )
# if j in Parents:
# j_children = Parents[j]
# if j not in p_children_D:
# p_children_D[ j ] = set( j_children )
# if len(p_children_D)>0:
# for i in p_children_D:
# TO_MERGE.add_node( i )
# p_j_children_pairs = combinations(p_children_D.keys(), 2)
# for cc in p_j_children_pairs:
# CID1 = cc[0]
# CID2 = cc[1]
# if p_children_D[ CID1 ] == p_children_D[ CID2 ]:
# print( "\t\t\tsame content:",CID1,CID2 )
# TO_MERGE.add_edge( CID1 , CID2 )
# print("")
# h = nx.connected_components(TO_MERGE)
# for ss in h:
# if len(ss)>1:
# print("\t\t\t",ss)
# # print(ss)
# merge_this = {}
# for ss_i in ss:
# print("\t\t\t\t",ss_i)
# # merge_this.union( p_children_D[ ss_i ] )
# elems = p_children_D[ ss_i ]
# for ll in elems:
# merge_this[ ll ] = True
# merge_this = set(merge_this.keys())
# print("\t\t\t",merge_this )
# print("")
# print("")
from n_partite_graph import nPartiteGraph
bg = nPartiteGraph()
graph_b = bg.BiGraph_2( nA2B , nB2A )
......@@ -443,6 +523,8 @@ class Phylo:
s = "A_"+str(s_)
t = "A_"+str(t_)
# print( NodesD_s2i[ s ] ,"->", NodesD_s2i[ t ] )
if "hidden" not in i:
if s in NodesD_s2i and t in NodesD_s2i:
ID_s = NodesD_s2i[ s ]
ID_t = NodesD_s2i[ t ]
......@@ -470,12 +552,17 @@ class Phylo:
for cID in phylojson["nodes"]:
ID_s = "A_"+str(cID)
try:
if ID_s in NodesD_s2i:
ID_i = NodesD_s2i[ ID_s ]
# print( cID ,":",ID_i )
node_ = phylojson["nodes"][cID]
node_["id"] = ID_i
node_["label"] = cID
if cID in nodes_md :
if "supp" in nodes_md[cID]:
node_["supp"] = nodes_md[cID]["supp"]
# node_["shape"] = "square"
# node_["type"] = "Cluster"
# "x":float(coord[0]) ,
......@@ -498,40 +585,6 @@ class Phylo:
xxx = 10
# a year-node
# # return { "nodes": [] , "links": [] }
# # # print("")
# # print(" - - - - - -")
# # print("")
# # for i in graphArray["links"]:
# # print("_ ",i)
# # print("_ ","")
# # for i in graphArray["nodes"]:
# # print( i)
# # print( "")
# # print( " - - - - - - - - -")
# # for i in graphArray["links"]:
# # print( i)
# # print( "")
# Nodes_DD = {}
# for i in Nodes:
# print("_ ",i["id"])
# Nodes_DD[ i["id"] ] = i
# # print("_ ","")
# print("_ "," - - - - - - - - -")
# for i in Links:
# print("_ ",i["s"] ,"->", i["t"] )
# print( Nodes_DD[ i["s"] ] )
# print( Nodes_DD[ i["t"] ] )
# print("")
# print("_ "," - - - - - - - - -")
# # print( "|V_phy|:", len(phylojson["nodes"]))
# # print( "|E_phy|:", len(phylojson["links"]))
# # print( "|V|:", len(graphArray["nodes"]))
......@@ -601,17 +654,13 @@ class Phylo:
return { "diff_time": { "Distribution": Distribution , "years":nyears , "pairs":pairs , "pairsD":pairsD } }
# print("")
# print( "old jacc:", self.minjacc )
# print( "new jacc:", p_["minjaccard"] )
# phylojson = self.filter_jaccard ( p_["minjaccard"] )
# print("")
return None
return {}
......
......@@ -168,9 +168,9 @@ class PhyloMaker:
for y in years:
AG.add_node(str(y), label=y , fake=True ,shape="plaintext")
for i in range(len(years)):
for i in sorted(years):
try:
AG.add_edge(str(years[i]),str(years[i+1]),fake=True)
AG.add_edge(str(i),str(i+1),fake=True,weight=1)
except:
pass
# - - - - - [ / Adding yearly-graph ] - - - - - #
......@@ -226,8 +226,18 @@ class PhyloMaker:
# redundant_ = nx.DiGraph()
# for n in AG.nodes_iter():
# node = AG.node[n]
Parents = { }
for n in AG.nodes_iter():
node = AG.node[n]
if "fake" not in node:
succesors = AG.neighbors( n )
if len( succesors )>0:
Parents[n] = sorted( succesors )
# print( n )
# for j in succesors:
# print( "\t",j )
# print("- - - - ")
# print("")
# if "fake" not in node:
# parents = AG.predecessors( n )
# if len(parents)>=2:
......@@ -350,9 +360,9 @@ class PhyloMaker:
for e in B.edges_iter():
s = e[0]
t = e[1]
# if "fake" not in AG[s][t]:
# print(e)
infodict = {"s":s , "t":t , "w":AG[s][t]["weight"] , "type":"line" }
if "fake" in AG[s][t]:
infodict["hidden"] = True
EdgesDict.append(infodict)
Graph = {
......@@ -366,4 +376,4 @@ class PhyloMaker:
end = time.time()
print(float("{0:.2f}".format(end - start)),"[s] : dot layout FIN")
return Graph
return Graph, Parents
......@@ -111,7 +111,7 @@ def close_contexts():
def test_post():
pprint.pprint( request )
query = "void"
GG = False
GG = { "nodes": [] , "links": [] }
stats = False
records = { "Count": 0 }
if request.method == "POST":
......@@ -218,9 +218,15 @@ def test_post():
# # pairs of years to be multiplied
I[ sID ].temp_matching( thepairs = pairs )
GG = I[ sID ].filter_jaccard ( jacc_min=minjaccard )
filters_ = {
"jacc_min": I[ sID ].p["minsetdistance"]["value"],
"minfsetsize": I[ sID ].p["minfsetsize"]["value"],
"minfsetsupp": I[ sID ].p["minfsetsupp"]["value"],
}
GG = I[ sID ].filter_jaccard ( filter_s=filters_ )
if len( GG["links"] )>0:
Ya = p_["from_"]
Yb = p_["to_"]
GG_v = str( len( GG["nodes"] ) )
......@@ -236,6 +242,7 @@ def test_post():
pprint.pprint( I[ sID ].p )
print( "" )
params_ = {}
# # Updating I[ sID ].p parameters with new ones # #
for k in p_:
if "scontext"!=k:
try:
......@@ -258,8 +265,25 @@ def test_post():
I[ sID ].pairsD = diff_ress["diff_time"]["pairsD"]
I[ sID ].temp_matching( thepairs = diff_ress["diff_time"]["pairs"] )
GG = I[ sID ].filter_jaccard ( jacc_min=I[ sID ].p["minsetdistance"]["value"] )
filters_ = {
"jacc_min": I[ sID ].p["minsetdistance"]["value"],
"minfsetsize": I[ sID ].p["minfsetsize"]["value"],
"minfsetsupp": I[ sID ].p["minfsetsupp"]["value"],
}
GG = I[ sID ].filter_jaccard ( filter_s=filters_ )
else:
pairs = I[ sID ].pairs
I[ sID ].temp_matching( thepairs = pairs )
filters_ = {
"jacc_min": I[ sID ].p["minsetdistance"]["value"],
"minfsetsize": I[ sID ].p["minfsetsize"]["value"],
"minfsetsupp": I[ sID ].p["minfsetsupp"]["value"],
}
GG = I[ sID ].filter_jaccard ( filter_s=filters_ )
if len( GG["links"] )>0:
Ya = str(I[ sID ].p["from_"]["value"])
Yb = str(I[ sID ].p["to_"]["value"])
GG_v = str( len( GG["nodes"] ) )
......
......@@ -126,13 +126,21 @@ function dict_diff(obj1, obj2) {
}
// Lookup table filled by get_ngrams(): each key of the server response is
// mapped to its value, and the value is then used as the replacement label
// for "mesh_term" nodes whose current label equals the key.
var K_i2s = {}
// Replacement label -> node id, recorded by get_ngrams() whenever it relabels
// a node.
var K_oi2i = {}
// var K_s2i = {}
// HTML snippet for the loading-bar image.
var loader_ = '<img width=20 src="/static/Phylo/libs/img2/loading-bar.gif"></img>'
// Shared state. "params_t0" is compared with the freshly read form parameters
// in the #pubmed_fetch handler (via dict_diff) to detect what changed.
// NOTE(review): presumably holds the parameters of the previous submission --
// confirm where it is assigned.
var G = {
"params_t0" : {},
}
// NOTE(review): the two *_2DEL maps are not referenced in the visible code;
// presumably they collect clusters / terms marked for deletion -- confirm.
var Clusters_2DEL = {}
var Terms_2DEL = {}
// NOTE(review): not referenced in the visible code; looks like a flag for a
// POST request -- confirm against its users.
var POST_ = false
// "scontext"
......@@ -154,6 +162,32 @@ function getParams(form , children_ ) {
return p_;
}
// Click handler for the "#remove_terms" button. The body is empty, so clicking
// the button currently does nothing.
$("#remove_terms").click( function(){
})
// Click handler for the "#remove_clusters" button.
// Drops every currently selected node whose type is "Cluster" from the
// rendered graph (partialGraph) and from the client-side lookup tables
// (Nodes, dicts.nodes, dicts.D2N, Relations["1|1"]), then refreshes and
// redraws the graph once. Does nothing when the selection is empty.
$("#remove_clusters").click( function(){
    console.log("removing clusteeeers")

    // Empty selection: nothing to remove and no redraw needed.
    if ( $.isEmptyObject( selections ) ) {
        return
    }

    for(var cID in selections) {
        var node = Nodes[cID]
        // This handler never removes ids from `selections`, so an id whose
        // Nodes entry was already deleted (e.g. by a previous click) can still
        // be listed. Skip it instead of throwing on `.type`.
        if( !node || node.type!="Cluster" ) {
            continue
        }

        partialGraph.dropNode(cID)

        // Best-effort bookkeeping: a missing lookup table must not abort the
        // removal of the remaining selected clusters, but the failure is
        // reported instead of being silently discarded.
        try {
            delete Nodes[cID]
            delete dicts.nodes[cID]
            delete dicts.D2N[cID]
            delete Relations["1|1"][cID]
        } catch(err) {
            console.warn("remove_clusters: incomplete cleanup for node " + cID, err)
        }
    }

    partialGraph.refresh()
    partialGraph.draw()
})
function send_params( D ) {
var query = $("#pubmedquery").val().slice()
......@@ -209,7 +243,7 @@ function send_params( D ) {
$("#pubmed_fetch").bind('click', function() {
console.log( "hola mundo" )
console.log( "pubmed_fetch" )
var URL = "<URL>"
......@@ -236,8 +270,20 @@ $("#pubmed_fetch").bind('click', function() {
var params_t1 = getParams("phyloform" , "input")
var params_diff = dict_diff( params_t1 , G["params_t0"] )
console.log("")
console.log("")
console.log("")
console.log("DIFF TIMES!!!")
console.log( params_t1 )
console.log( G["params_t0"] )
console.log(" - - - - - - - ")
console.log( params_diff )
console.log("")
console.log("")
console.log("")
// spark context has changed -> change everything
if("scontext" in params_diff) {
if("query" in params_diff) {
return send_params( params_t1 )
}
......@@ -331,32 +377,32 @@ function get_ngrams( query ) {
data: DD ,
success : function(data) {
console.log( "get_ngrams!!" )
console.log( "data:" )
console.log( data )
// console.log( "data:" )
// console.log( data )
for(var i in data){
K_i2s[ i ] = data[i]
}
console.log( "K_i2s:" )
console.log( K_i2s )
// console.log( "K_i2s:" )
// console.log( K_i2s )
console.log("iter mesh_terms")
// console.log("iter mesh_terms")
for(var i in dicts.nodes) {
if( dicts.nodes[i].type=="mesh_term" ) {
// console.log ( dicts.nodes[i] )
// console.log ( K_i2s[dicts.nodes[i].label] )
// console.log ( K_i2s[Number(dicts.nodes[i].label)] )
// console.log("")
console.log( dicts.nodes[i].label )
console.log( K_i2s[dicts.nodes[i].label] )
// console.log( dicts.nodes[i].label )
// console.log( K_i2s[dicts.nodes[i].label] )
var ID = dicts.nodes[i].id
var newlabel = K_i2s[dicts.nodes[i].label]
if( typeof( newlabel )!="undefined" ) {
K_oi2i [ newlabel ] = ID
dicts.nodes[i].label = newlabel
Nodes[ ID ].label = newlabel
console.log( dicts.nodes[i] )
console.log( Nodes[ ID ] )
console.log("")
}
}
}
......@@ -364,6 +410,12 @@ function get_ngrams( query ) {
partialGraph.draw()
labels = []
for(var kk in K_i2s ){
updateSearchLabels( kk , K_i2s[kk] , "mesh_term");
}
},
error: function(jqxhr,textStatus,errorThrown) {
......
......@@ -136,25 +136,60 @@ def test_workflow():
import time
print("hello")
minsupp = 0.0001
numpart = 100
minfsetsize = 4
import findspark
findspark.init()
from pyspark.mllib.fpm import FPGrowth
from pyspark import SparkContext
from pyspark import SparkConf
cfg = SparkConf().set('spark.driver.memory', "40g").set('spark.driver.cores', 20 ).setAppName("simple_app")
ncores = 20
sc__ = SparkContext(conf=cfg)
from PhyloSpark import Phylo
periods_ = [ 1983 , 1984 ]
the_ = Phylo( t=periods_ , memm="20g" , ncores="24" )
periods_ = range(2003,2005+1)
the_ = Phylo( t=periods_ , minJ=0.0 , spark_context=sc__ , ncores=ncores )
# WL = getWL( the_.sc , "/datasets/PubMed2014/chikungunya.txt" )
WL_path = "/datasets/PubMed2014/chikungunya.txt"
WL = the_.sc.textFile( WL_path ).map( lambda line: (int(line.strip()) , 1) )
# WL_path = "/datasets/PubMed2014/gut_AND_brain.txt"
WL = sc__.textFile( WL_path ).map( lambda line: (int(line.strip()) , 1) )
# WL = getWL( the_.sc , "/datasets/PubMed2014/cell-aging.txt" )
# WL = getWL( the_.sc , "/datasets/PubMed2014/rheumatoid-arthritis.txt" )
for i in range(1983,2015):
# start = time.time()
for i in range(2003,2005+1):
start = time.time()
period = str(i)
print(period)
Psub = interDataSet( the_.sc , period , WL )
# print("\t",len(ress.collect()))
# print(period)
T = interDataSet( sc__ , period , WL ).map(lambda x: x[1] )
print("\t",period,"->",len(T.collect()))
# print( "\t", T.take(1))
model = FPGrowth.train(T, minSupport=minsupp, numPartitions=numpart)
FI_all_c = model.freqItemsets().count()
print("\t\t |FI|", FI_all_c)
t_i = time.time() ##
FI = model.freqItemsets().filter(lambda x: len(x.items)>=minfsetsize and x.freq>=2)
FI = FI.sortBy(lambda x: x.freq , ascending=False).zipWithIndex().map( lambda x : (x[1],x[0]) ).persist()
FI_c = FI.count()
t_f = time.time() ##
FI_t = "{0:.3f}".format((t_f - t_i)) +"[s]" ##
print("\t\t |FI_| ", FI_c , "\t",FI_t )
print("")
# end = time.time()
# print("\t\t",end - start)
print("")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment