Commit 2b36c5d0 authored by PkSM3

[UPDATE] pubmed|istex removal (for the moment)

parent fc089d47
@@ -287,346 +287,7 @@ class Node(CTENode):
print("In workflow() END") print("In workflow() END")
self.metadata['Processing'] = 0 self.metadata['Processing'] = 0
self.save() self.save()
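    # Helper: runs each of the given callables in its own Process and blocks until all of them finish.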
    def runInParallel(self, *fns):
        proc = []
        for fn in fns:
            p = Process(target=fn)
            p.start()
            proc.append(p)
        for p in proc:
            p.join()
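    # Parses every not-yet-parsed resource attached to this node with the parser matching its
    # resource type (pubmed, isi, ris, europress, ...) and returns the combined metadata list.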
    def parse_resources__MOV(self, verbose=False):
        # parse all resources into a list of metadata
        metadata_list = []
        print("not parsed resources:")
        print(self.node_resource.filter(parsed=False))
        print("= = = = = = = = = = =\n")
        for node_resource in self.node_resource.filter(parsed=False):
            resource = node_resource.resource
            parser = defaultdict(lambda: FileParser.FileParser, {
                'istext'            : ISText,
                'pubmed'            : PubmedFileParser,
                'isi'               : IsiFileParser,
                'ris'               : RisFileParser,
                'europress'         : EuropressFileParser,
                'europress_french'  : EuropressFileParser,
                'europress_english' : EuropressFileParser,
            })[resource.type.name]()
            metadata_list += parser.parse(str(resource.file))
        self.node_resource.update(parsed=True)  # writing to DB
        return metadata_list
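    # Creates one Document child node per metadata entry (title, language, raw metadata),
    # then makes the children's metadata filterable and marks the resources as parsed.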
    def writeMetadata__MOV(self, metadata_list=None, verbose=False):
        type_id = NodeType.objects.get(name='Document').id
        user_id = self.user.id
        langages_cache = LanguagesCache()
        # # insert the new resources in the database!
        for i, metadata_values in enumerate(metadata_list):
            name = metadata_values.get('title', '')[:200]
            language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
            if isinstance(language, tuple):
                language = language[0]
            try:
                node = Node(
                    user_id = user_id,
                    type_id = type_id,
                    name = name,
                    parent = self,
                    language_id = language.id if language else None,
                    metadata = metadata_values
                )
                node.save()
            except Exception as e:
                print("ERR::node.models.writeMetadata__MOV ")
            metadata_values["id"] = node.id
        # # make metadata filterable
        self.children.all().make_metadata_filterable()
        # # mark the resources as parsed for this node
        self.node_resource.update(parsed=True)
        return metadata_list
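    # For each document's metadata, extracts n-grams from the selected fields with the
    # language-specific extractor and returns [document_id, {ngram: weight}] pairs.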
    def extract_ngrams__MOV(self, array, keys, ngramsextractorscache=None, ngramscaches=None):
        if ngramsextractorscache is None:
            ngramsextractorscache = NgramsExtractorsCache()
        langages_cache = LanguagesCache()
        if ngramscaches is None:
            ngramscaches = NgramsCaches()
        results = []
        i = 0
        for metadata in array:
            associations = defaultdict(float)  # float or int?
            language = langages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None,
            if isinstance(language, tuple):
                language = language[0]
            extractor = ngramsextractorscache[language]
            ngrams = ngramscaches[language]
            # theText = []
            if isinstance(keys, dict):
                for key, weight in keys.items():
                    if key in metadata:
                        text2process = str(metadata[key]).replace('[','').replace(']','')
                        # theText.append(text2process)
                        for ngram in extractor.extract_ngrams(text2process):
                            terms = ' '.join([token for token, tag in ngram]).strip().lower()
                            associations[ngram] += weight
            else:
                for key in keys:
                    if key in metadata:
                        text2process = str(metadata[key]).replace('[','').replace(']','')
                        # theText.append(text2process)
                        for ngram in extractor.extract_ngrams(text2process):
                            terms = ' '.join([token for token, tag in ngram]).strip().lower()
                            associations[terms] += 1
            if len(associations) > 0:
                results.append([metadata["id"], associations])
            i += 1
        return results
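    # Keeps the n most frequent {2,3}-grams over the corpus and scores them per document with
    # tf-idf: tf = occurrences_of_ngram / ngrams_by_document, idf = ln(N / document_count_of_ngram).
    # E.g. a term seen twice among a document's 10 kept n-grams and present in 5 of 100 documents
    # scores (2/10) * ln(100/5) ≈ 0.2 * 3.0 ≈ 0.6. Also adds, per document, a complete subgraph
    # of its co-occurring terms via analysis.InterUnion.Utils.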
    def do_tfidf__MOV(self, FreqList, Metadata, n=150):
        from analysis.InterUnion import Utils
        calc = Utils()
        # *01* [ calculating global count of each ngram ]
        GlobalCount = {}
        for i in FreqList:
            docID = i[0]
            associations = i[1]
            for ngram_text, weight in associations.items():
                if len(ngram_text.split()) > 1 and len(ngram_text.split()) < 4:  # considering just {2,3}-grams
                    if ngram_text in GlobalCount:
                        if "C" in GlobalCount[ngram_text]:
                            GlobalCount[ngram_text]["C"] += 1
                        else:
                            GlobalCount[ngram_text] = {}
                            GlobalCount[ngram_text]["C"] = 1
                    else:
                        GlobalCount[ngram_text] = {}
                        GlobalCount[ngram_text]["C"] = 1
        # *01* [ / calculating global count of each ngram ]
        # *02* [ Considering the first <150 ngrams by DESC occurrences ]
        FirstNgrams = {}
        sortedList = sorted(GlobalCount, key=lambda x: GlobalCount[x]['C'], reverse=True)
        for i in range(len(sortedList)):
            term = sortedList[i]
            FirstNgrams[term] = {}
            FirstNgrams[term]["C"] = GlobalCount[term]["C"]
            if i == (n-1): break
        # *02* [ / Considering the first <150 ngrams by DESC occurrences ]
        N = float(len(FreqList))  # nro docs really processed
        # *03* [ making dictionaries for NGram_Text <=> NGram_ID ]
        NGram2ID = {}
        ID2NGram = {}
        ngramid = 0
        for i in FirstNgrams:
            NGram2ID[i] = ngramid
            ID2NGram[ngramid] = i
            ngramid += 1
        # *03* [ / making dictionaries for NGram_Text <=> NGram_ID ]
        docs_X_terms = {}
        for i in FreqList:  # foreach ID in Doc:
            docID = i[0]
            associations = i[1]
            # [ considering just {2,3}-grams ]
            termsCount = 0
            for ngram_text, weight in associations.items():
                if ngram_text in NGram2ID:  # considering just {2,3}-grams
                    termsCount += 1
            # [ / considering just {2,3}-grams ]
            ngrams_by_document = termsCount  # i re-calculed this because of *02*
            terms = []
            terms_occ = []
            if ngrams_by_document > 0:
                for ngram_text, weight in associations.items():
                    if ngram_text in NGram2ID:
                        terms.append(NGram2ID[ngram_text])
                        # [ calculating TF-IDF ]
                        occurrences_of_ngram = weight
                        term_frequency = occurrences_of_ngram / ngrams_by_document
                        xx = N
                        yy = FirstNgrams[ngram_text]["C"]
                        inverse_document_frequency = log(xx/yy)  # log base e
                        tfidfScore = term_frequency * inverse_document_frequency
                        terms_occ.append([NGram2ID[ngram_text], round(tfidfScore, 3)])
                        # [ / calculating TF-IDF ]
                        if "T" in FirstNgrams[ngram_text]:
                            FirstNgrams[ngram_text]["T"].append(tfidfScore)
                        else:
                            FirstNgrams[ngram_text]["T"] = [tfidfScore]
            if len(terms) > 1:
                docs_X_terms[docID] = terms_occ
                # print("docid:",docID)
                # for i in terms:
                #     print("\t",ID2NGram[i])
                calc.addCompleteSubGraph(terms)
        return {"G": calc.G, "TERMS": ID2NGram, "ii": docs_X_terms, "metrics": FirstNgrams}
    def do_coocmatrix__MOV(self, TERMS, G, n=150, type='node_link'):
        import pandas as pd
        from copy import copy
        import numpy as np
        import networkx as nx
        from networkx.readwrite import json_graph
        from gargantext_web.api import JsonHttpResponse
        from analysis.louvain import best_partition
        matrix = defaultdict(lambda: defaultdict(float))
        ids = dict()
        labels = dict()
        weight = dict()
        print("PRINTING NUMBER OF NODES 01:", len(G))
        for e in G.edges_iter():
            n1 = e[0]
            n2 = e[1]
            w = G[n1][n2]['weight']
            # print(n1," <=> ",n2, " : ", G[n1][n2]['weight'],"\t",TERMS[n1]," <=> ",TERMS[n2], "\t", G[n1][n2]['weight'])
            ids[TERMS[n1]] = n1
            ids[TERMS[n2]] = n2
            labels[n1] = TERMS[n1]
            labels[n2] = TERMS[n2]
            matrix[n1][n2] = w
            matrix[n2][n1] = w
            weight[n2] = weight.get(n2, 0) + w
            weight[n1] = weight.get(n1, 0) + w
        df = pd.DataFrame(matrix).fillna(0)
        x = copy(df.values)
        x = x / x.sum(axis=1)
        # Removing unconnected nodes
        threshold = min(x.max(axis=1))
        matrix_filtered = np.where(x >= threshold, 1, 0)
        # matrix_filtered = np.where(x > threshold, x, 0)
        # matrix_filtered = matrix_filtered.resize((90,90))
        G = nx.from_numpy_matrix(matrix_filtered)
        # G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
        partition = best_partition(G)
        data = []
        if type == "node_link":
            for community in set(partition.values()):
                G.add_node("cluster " + str(community), hidden=1)
            for node in G.nodes():
                try:
                    G.node[node]['label'] = TERMS[node]
                    G.node[node]['pk'] = node
                    G.node[node]['size'] = weight[node]
                    G.node[node]['group'] = partition[node]
                    G.add_edge(node, "cluster " + str(partition[node]), weight=3)
                except Exception as error:
                    print("ERROR:", error)
            print("PRINTING NUMBER OF NODES 02:", len(G))
            data = json_graph.node_link_data(G)
        elif type == "adjacency":
            for node in G.nodes():
                try:
                    # node,type(labels[node])
                    # G.node[node]['label'] = node
                    G.node[node]['name'] = node
                    # G.node[node]['size'] = weight[node]
                    G.node[node]['group'] = partition[node]
                    # G.add_edge(node, partition[node], weight=3)
                except Exception as error:
                    print(error)
            data = json_graph.node_link_data(G)
        return data
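    # Experimental "__MOV" pipeline: parse resources, write Document nodes, extract n-grams,
    # compute tf-idf and the co-occurrence graph, then dump the resulting JSON graph under
    # MEDIA_ROOT/corpora/<user>/<project_id>_<corpus_id>.json, logging the time of each step.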
    def workflow__MOV(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
        import time
        total = 0
        self.metadata['Processing'] = 1
        self.save()
        # # pwd = subprocess.Popen("cd /srv/gargantext/parsing/Taggers/nlpserver && pwd", stdout=subprocess.PIPE).stdout.read()
        # # print(subprocess.Popen(['ls', '-lah'], stdout=subprocess.PIPE).communicate()[0].decode('utf-8'))
        # print("activating nlpserver:")
        # command = 'cd parsing/Taggers/nlpserver; python3 server.py'
        # process = subprocess.Popen(command,stdout=subprocess.PIPE , stderr=subprocess.DEVNULL , shell=True)
        print("LOG::TIME: In workflow() parse_resources__MOV()")
        start = time.time()
        theMetadata = self.parse_resources__MOV()
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources()__MOV [s]", (end - start))
        print("LOG::TIME: In workflow() writeMetadata__MOV()")
        start = time.time()
        theMetadata = self.writeMetadata__MOV(metadata_list=theMetadata)
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]", (end - start))
        print("LOG::TIME: In workflow() extract_ngrams__MOV()")
        start = time.time()
        FreqList = self.extract_ngrams__MOV(theMetadata, keys=['title'])
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams__MOV() [s]", (end - start))
        # process.kill()
        # print("ok, process killed")
        start = time.time()
        print("LOG::TIME: In workflow() do_tfidf()")
        resultDict = self.do_tfidf__MOV(FreqList, theMetadata)
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]", (end - start))
        # # print("LOG::TIME: In workflow() / do_tfidf()")
        start = time.time()
        print("LOG::TIME: In workflow() do_coocmatrix()")
        jsongraph = self.do_coocmatrix__MOV(resultDict["TERMS"], resultDict["G"], n=150)
        jsongraph["stats"] = resultDict["ii"]
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_coocmatrix() [s]", (end - start))
        print("the user:", self.user)
        print("the project id:", self.parent.id)
        print("the corpus id:", self.id)
        # timestamp = str(datetime.datetime.now().isoformat())
        # # filename = MEDIA_ROOT + '/corpora/%s/%s_%s__%s.json' % (self.user , self.parent.id, self.id , timestamp)
        filename = MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (self.user, self.parent.id, self.id)
        import json
        f = open(filename, "w")
        f.write(json.dumps(jsongraph))
        f.close()
        # # # this is not working
        # # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" In workflow() END")
        self.metadata['Processing'] = 0
        self.save()
class Node_Metadata(models.Model):
    node = models.ForeignKey(Node, on_delete=models.CASCADE)
...
@@ -73,75 +73,75 @@ def doTheQuery(request , project_id):
alist = ["hola","mundo"] alist = ["hola","mundo"]
if request.method == "POST": if request.method == "POST":
        # query = request.POST["query"]
        # name = request.POST["string"]
        # instancia = MedlineFetcher()
        # thequeries = json.loads(query)
        # urlreqs = []
        # for yearquery in thequeries:
        #     urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
        # alist = ["tudo fixe" , "tudo bem"]
        # """
        # urlreqs: List of urls to query.
        # - Then, to each url in urlreqs you do:
        #     eFetchResult = urlopen(url)
        #     eFetchResult.read()  # this will output the XML... normally you write this to a XML-file.
        # """
        # thefile = "how we do this here?"
        # resource_type = ResourceType.objects.get(name="pubmed" )
        # parent = Node.objects.get(id=project_id)
        # node_type = NodeType.objects.get(name='Corpus')
        # type_id = NodeType.objects.get(name='Document').id
        # user_id = User.objects.get( username=request.user ).id
        # corpus = Node(
        #     user=request.user,
        #     parent=parent,
        #     type=node_type,
        #     name=name,
        # )
        # corpus.save()
        # tasks = MedlineFetcher()
        # for i in range(8):
        #     t = threading.Thread(target=tasks.worker2)  # thing to do
        #     t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
        #     t.start()
        # for url in urlreqs:
        #     filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
        #     tasks.q.put( [url , filename] )  # put a task in the queue
        # tasks.q.join()  # wait until everything is finished
        # dwnldsOK = 0
        # for filename in tasks.firstResults:
        #     if filename != False:
        #         corpus.add_resource( user=request.user, type=resource_type, file=filename )
        #         dwnldsOK += 1
        # if dwnldsOK == 0: return JsonHttpResponse(["fail"])
        # # do the WorkFlow
        # try:
        #     if DEBUG is True:
        #         # corpus.workflow()  # old times...
        #         corpus.workflow__MOV()
        #         # corpus.write_everything_to_DB()
        #     else:
        #         # corpus.workflow.apply_async((), countdown=3)
        #         corpus.workflow__MOV().apply_async((), countdown=3)  # synchronous! because is faaast
        #         # corpus.write_everything_to_DB.apply_async((), countdown=3)  # asynchronous
        #     return JsonHttpResponse(["workflow","finished"])
        # except Exception as error:
        #     print(error)
        return JsonHttpResponse(["out of service for the moment"])  # was: return JsonHttpResponse(["workflow","finished","outside the try-except"])
    data = alist
    return JsonHttpResponse(data)
@@ -164,59 +164,59 @@ def testISTEX(request , project_id):
    print(query_string , query , N)
    # urlreqs = []
    # pagesize = 50
    # tasks = MedlineFetcher()
    # chunks = list(tasks.chunks(range(N), pagesize))
    # for k in chunks:
    #     if (k[0]+pagesize) > N: pagesize = N - k[0]
    #     urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
    # print(urlreqs)
    # urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
    # print(urlreqs)
    # resource_type = ResourceType.objects.get(name="istext" )
    # parent = Node.objects.get(id=project_id)
    # node_type = NodeType.objects.get(name='Corpus')
    # type_id = NodeType.objects.get(name='Document').id
    # user_id = User.objects.get( username=request.user ).id
    # corpus = Node(
    #     user=request.user,
    #     parent=parent,
    #     type=node_type,
    #     name=query,
    # )
    # corpus.save()
    # # configuring your queue with the event
    # for i in range(8):
    #     t = threading.Thread(target=tasks.worker2)  # thing to do
    #     t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
    #     t.start()
    # for url in urlreqs:
    #     filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
    #     tasks.q.put( [url , filename] )  # put a task in the queue
    # tasks.q.join()  # wait until everything is finished
    # for filename in tasks.firstResults:
    #     corpus.add_resource( user=request.user, type=resource_type, file=filename )
    # corpus.save()
    # print("DEBUG:",DEBUG)
    # # do the WorkFlow
    # try:
    #     if DEBUG is True:
    #         corpus.workflow()
    #     else:
    #         corpus.workflow.apply_async((), countdown=3)
    #     return JsonHttpResponse(["workflow","finished"])
    # except Exception as error:
    #     print(error)
    data = [query_string,query,N]
    return JsonHttpResponse(data)
...