Commit 28bb9273 authored by PkSM3

[UPDATE] progress in workflow__MOV

parent 4fbc8a98
@@ -268,7 +268,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
 from analysis.tfidf import tfidf
 def do_tfidf(corpus, reset=True):
-    print("=========== doing tfidf ===========")
+    # print("=========== doing tfidf ===========")
     with transaction.atomic():
         if reset==True:
             NodeNodeNgram.objects.filter(nodex=corpus).delete()
@@ -278,8 +278,7 @@ def do_tfidf(corpus, reset=True):
     # # for i in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
     for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
         # print("the doc:",document)
-        somevariable = Node_Ngram.objects.filter(node=document)
-        for node_ngram in somevariable:
+        for node_ngram in Node_Ngram.objects.filter(node=document):
            # print("\tngram:",node_ngram.ngram)
            try:
                nnn = NodeNodeNgram.objects.get(nodex=corpus, nodey=document, ngram=node_ngram.ngram)
...
@@ -60,6 +60,7 @@ def tfidf(corpus, document, ngram):
         .filter(NodeNgram.ngram_id == ngram.id)\
         .count()
+    # print("\t\t\t","occs:",occurrences_of_ngram," || ngramsbydoc:",ngrams_by_document," || TF = occ/ngramsbydoc:",term_frequency," |||||| x:",xx," || y:",yy," || IDF = log(x/y):",log(xx/yy))
     inverse_document_frequency= log(xx/yy)
     # result = tf * idf
...
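Note: the quantities in this hunk combine into the standard tf-idf score, `term_frequency` times `log(xx/yy)`, where `xx` is the corpus document count and `yy` the number of documents containing the ngram. A minimal self-contained sketch for reference (function and argument names are illustrative, not from the repo):

```python
from math import log

def tfidf_score(occurrences, ngrams_in_doc, n_docs, docs_with_ngram):
    tf = occurrences / ngrams_in_doc     # TF = occ / ngramsbydoc
    idf = log(n_docs / docs_with_ngram)  # IDF = log(x/y), natural log
    return tf * idf

# An ngram seen 3 times in a 100-ngram document, present in 5 of 1000 docs:
print(tfidf_score(3, 100, 1000, 5))  # 0.03 * log(200) ≈ 0.1589
```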
@@ -14,6 +14,7 @@ from parsing.FileParsers import *
 from time import time
 import datetime
 from multiprocessing import Process
+from math import log
 from collections import defaultdict
 import hashlib
@@ -229,13 +230,6 @@ class Node(CTENode):
                 for ngram in extractor.extract_ngrams(self.metadata[key]):
                     terms = ' '.join([token for token, tag in ngram])
                     associations[terms] += 1
-        import pprint
-        pprint.pprint(associations)
-        print(" - - - - - ")
-        #print(associations)
-        # insert the occurrences in the database
-        # print(associations.items())
         Node_Ngram.objects.bulk_create([
             Node_Ngram(
                 node = self,
@@ -284,6 +278,14 @@ class Node(CTENode):
+    def runInParallel(self, *fns):
+        proc = []
+        for fn in fns:
+            p = Process(target=fn)
+            p.start()
+            proc.append(p)
+        for p in proc:
+            p.join()
     def parse_resources__MOV(self, verbose=False):
         # parse all resources into a list of metadata
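Note: `runInParallel` expects callables, handing each `fn` to `Process(target=fn)` and joining them all. The commented-out call later in this commit (flagged "this is not working") invokes `self.writeMetadata__MOV(...)` and `self.extract_ngrams__MOV(...)` eagerly and passes their return values instead of callables, which would explain the failure. A hedged sketch of a working invocation, assuming a `node` instance and the `theMetadata` list from the workflow:

```python
from functools import partial

# Defer the calls so Process(target=fn) runs them in the child processes.
node.runInParallel(
    partial(node.writeMetadata__MOV, metadata_list=theMetadata),
    partial(node.extract_ngrams__MOV, theMetadata, keys=['title', 'abstract']),
)
```

Even then, each child process works on its own copy of `node`, so return values such as the frequency list from `extract_ngrams__MOV` are lost; only externally visible side effects (e.g. database writes) survive, which may be why the sequential path is kept.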
@@ -324,7 +326,6 @@ class Node(CTENode):
                 language_id = language.id if language else None,
                 metadata = metadata_values
             ).save()
-            metadata_list[i]["thelang"] = language
         # # make metadata filterable
         self.children.all().make_metadata_filterable()
         # # mark the resources as parsed for this node
@@ -338,48 +339,63 @@ class Node(CTENode):
         if ngramscaches is None:
             ngramscaches = NgramsCaches()
+        results = []
+        i = 0
         for metadata in array:
             associations = defaultdict(float) # float or int?
             language = langages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None,
             if isinstance(language, tuple):
                 language = language[0]
+            metadata["thelang"] = language
             extractor = ngramsextractorscache[language]
             ngrams = ngramscaches[language]
-            # print("\t\t number of req keys:",len(keys)," AND isdict?:",isinstance(keys, dict))
+            # theText = []
             if isinstance(keys, dict):
                 for key, weight in keys.items():
                     if key in metadata:
-                        for ngram in extractor.extract_ngrams(metadata[key]):
-                            terms = ' '.join([token for token, tag in ngram])
+                        text2process = str(metadata[key]).replace('[','').replace(']','')
+                        # theText.append(text2process)
+                        for ngram in extractor.extract_ngrams(text2process):
+                            terms = ' '.join([token for token, tag in ngram]).strip().lower()
                             associations[ngram] += weight
             else:
                 for key in keys:
                     if key in metadata:
-                        # print("the_content:[[[[[[__",metadata[key],"__]]]]]]")
-                        for ngram in extractor.extract_ngrams(metadata[key]):
-                            terms = ' '.join([token for token, tag in ngram])
+                        text2process = str(metadata[key]).replace('[','').replace(']','')
+                        # theText.append(text2process)
+                        for ngram in extractor.extract_ngrams(text2process):
+                            terms = ' '.join([token for token, tag in ngram]).strip().lower()
                             associations[terms] += 1
-            if len(associations.items())>0:
-                Node_Ngram.objects.bulk_create([
-                    Node_Ngram(
-                        node = self,
-                        ngram = ngrams[ngram_text],
-                        weight = weight
-                    )
-                    for ngram_text, weight in associations.items()
-                ])
-                # for ngram_text, weight in associations.items():
-                #     print("ngram_text:",ngram_text," | weight:",weight, " | ngrams[ngram_text]:",ngrams[ngram_text])
-    def runInParallel(self, *fns):
-        proc = []
-        for fn in fns:
-            p = Process(target=fn)
-            p.start()
-            proc.append(p)
-        for p in proc:
-            p.join()
+            if(len(associations)>0):
+                results.append( [i , associations] )
+            i+=1
+        return results
+    def do_tfidf__MOV( self, FreqList ):
+        IDFList = {}
+        for i in FreqList:
+            arrayID = i[0]
+            associations = i[1]
+            for ngram_text, weight in associations.items():
+                if ngram_text in IDFList: IDFList[ngram_text] += 1
+                else: IDFList[ngram_text] = 1
+        N = float(len(FreqList)) # number of docs actually processed
+        for i in FreqList:
+            arrayID = i[0]
+            associations = i[1]
+            ngrams_by_document = len(associations.items())
+            for ngram_text, weight in associations.items():
+                occurrences_of_ngram = weight
+                term_frequency = occurrences_of_ngram / ngrams_by_document
+                xx = N
+                yy = IDFList[ngram_text]
+                inverse_document_frequency= log(xx/yy) #log base e
     def workflow__MOV(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
         import time
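Note: as committed, `do_tfidf__MOV` computes `term_frequency` and `inverse_document_frequency` but never multiplies or stores them, so its second loop has no observable effect (and in the dict branch of `extract_ngrams__MOV`, `terms` is built but `associations[ngram] += weight` keys on the raw ngram tuple, which looks unintended). A hedged sketch of the same routine returning its scores; the `scores` structure is illustrative, not part of the commit, and `log` is the `from math import log` added above:

```python
def do_tfidf__MOV(self, FreqList):
    # document frequency: in how many documents each ngram appears
    IDFList = {}
    for arrayID, associations in FreqList:
        for ngram_text in associations:
            IDFList[ngram_text] = IDFList.get(ngram_text, 0) + 1
    N = float(len(FreqList))  # number of docs actually processed
    scores = {}               # {doc index: {ngram: tf-idf}}
    for arrayID, associations in FreqList:
        ngrams_by_document = len(associations)
        for ngram_text, weight in associations.items():
            tf = weight / ngrams_by_document
            idf = log(N / IDFList[ngram_text])  # natural log
            scores.setdefault(arrayID, {})[ngram_text] = tf * idf
    return scores
```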
@@ -394,32 +410,44 @@ class Node(CTENode):
         total += (end - start)
         print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources()__MOV [s]",(end - start))
-        print("LOG::TIME: In workflow() writeMetadata__MOV()")
-        start = time.time()
-        self.writeMetadata__MOV( metadata_list=theMetadata )
-        end = time.time()
-        total += (end - start)
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]",(end - start))
+        # print("LOG::TIME: In workflow() writeMetadata__MOV()")
+        # start = time.time()
+        # self.writeMetadata__MOV( metadata_list=theMetadata )
+        # end = time.time()
+        # total += (end - start)
+        # print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]",(end - start))
         print("LOG::TIME: In workflow() extract_ngrams__MOV()")
         start = time.time()
-        self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] )
+        FreqList = self.extract_ngrams__MOV(theMetadata , keys=['title',] )
         end = time.time()
         total += (end - start)
         print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams__MOV() [s]",(end - start))
-        # # this is not working
-        # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )
-        start = time.time()
-        print("LOG::TIME: In workflow() do_tfidf()")
-        from analysis.functions import do_tfidf
-        do_tfidf(self)
-        end = time.time()
-        total += (end - start)
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
-        # # print("LOG::TIME: In workflow() / do_tfidf()")
+        # start = time.time()
+        # print("LOG::TIME: In workflow() do_tfidf()")
+        # self.do_tfidf__MOV( FreqList )
+        # end = time.time()
+        # total += (end - start)
+        # print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
+        # # # print("LOG::TIME: In workflow() / do_tfidf()")
+        # # # this is not working
+        # # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )
+        # start = time.time()
+        # print("LOG::TIME: In workflow() do_tfidf()")
+        # from analysis.functions import do_tfidf
+        # do_tfidf(self)
+        # end = time.time()
+        # total += (end - start)
+        # print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
+        # # # print("LOG::TIME: In workflow() / do_tfidf()")
         print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" In workflow() END")
...
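Note: the `start = time.time() ... print("LOG::TIME:_ ...")` pattern is repeated for every stage of `workflow__MOV`; it could be factored into a context manager. A sketch under that assumption (the `log_time` helper is hypothetical, not in the repo):

```python
import time
import datetime
from contextlib import contextmanager

@contextmanager
def log_time(label):
    # Mirrors the LOG::TIME:_ lines produced by workflow__MOV.
    start = time.time()
    yield
    elapsed = time.time() - start
    print("LOG::TIME:_ " + datetime.datetime.now().isoformat()
          + " " + label + " [s]", elapsed)

# usage inside workflow__MOV:
# with log_time("extract_ngrams__MOV()"):
#     FreqList = self.extract_ngrams__MOV(theMetadata, keys=['title'])
```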
@@ -25,6 +25,7 @@ class PubmedFileParser(FileParser):
         metadata_path = {
             "journal"       : 'MedlineCitation/Article/Journal/Title',
             "title"         : 'MedlineCitation/Article/ArticleTitle',
+            # "abstract"      : 'MedlineCitation/Article/Abstract/AbstractText',
             "title2"        : 'MedlineCitation/Article/VernacularTitle',
             "language_iso3" : 'MedlineCitation/Article/Language',
             "doi"           : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
@@ -102,6 +103,7 @@ class PubmedFileParser(FileParser):
             if "title2" in metadata: metadata.pop("title2")
             # print(metadata)
+            # print("* * * * ** * * * * ")
             metadata_list.append(metadata)
         # return the list of metadata
         return metadata_list
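Note: `metadata_path` maps each metadata key to an ElementTree path evaluated against a parsed `<PubmedArticle>` element; the commented-out `abstract` entry would feed abstracts into the same pipeline once re-enabled. A simplified sketch of the lookup this table implies (the actual `FileParser` code may differ):

```python
import xml.etree.ElementTree as ET

xml = """<PubmedArticle><MedlineCitation><Article>
    <ArticleTitle>Sample title</ArticleTitle>
    <Language>eng</Language>
</Article></MedlineCitation></PubmedArticle>"""

article = ET.fromstring(xml)
metadata_path = {
    "title":         'MedlineCitation/Article/ArticleTitle',
    "language_iso3": 'MedlineCitation/Article/Language',
}
metadata = {}
for key, path in metadata_path.items():
    node = article.find(path)  # first element matching the path, or None
    if node is not None:
        metadata[key] = node.text
print(metadata)  # {'title': 'Sample title', 'language_iso3': 'eng'}
```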
-from ..Taggers import Tagger
+from ..Taggers import TurboTagger
 import nltk
...
@@ -18,7 +18,7 @@ class NgramsExtractor:
         self.stop()
     def start(self):
-        self.tagger = Tagger()
+        self.tagger = TurboTagger()
     def stop(self):
         pass
...
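Note: swapping `Tagger` for `TurboTagger` is a one-line change because `NgramsExtractor` only touches the tagger through a small surface: construction in `start()` plus a tagging call whose output is consumed as `(token, tag)` pairs (see the `' '.join([token for token, tag in ngram])` lines above). A sketch of the contract this implies, inferred from the surrounding code rather than taken from the repo:

```python
class MinimalTagger:
    """Any class with this shape could be assigned in NgramsExtractor.start()."""
    def tag_text(self, text):
        # Return a list of (token, tag) pairs, e.g. [('the', 'DT'), ('cat', 'NN')].
        # Placeholder tagging: every whitespace token gets the tag 'UNK'.
        return [(token, 'UNK') for token in text.split()]
```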
@@ -59,8 +59,10 @@ class Tagger:
             self.tagging_end()
             return []
     """Send a text to be tagged.
     """
+    # Not used right now
     def tag_text(self, text):
         tokens_tags = []
         self.tagging_start()
...