Commit 28bb9273 authored by PkSM3

[UPDATE] progress in workflow__MOV

parent 4fbc8a98
......@@ -268,7 +268,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
from analysis.tfidf import tfidf
def do_tfidf(corpus, reset=True):
print("=========== doing tfidf ===========")
# print("=========== doing tfidf ===========")
with transaction.atomic():
if reset==True:
NodeNodeNgram.objects.filter(nodex=corpus).delete()
......@@ -278,8 +278,7 @@ def do_tfidf(corpus, reset=True):
# # for i in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
# print("the doc:",document)
somevariable = Node_Ngram.objects.filter(node=document)
for node_ngram in somevariable:
for node_ngram in Node_Ngram.objects.filter(node=document):
# print("\tngram:",node_ngram.ngram)
try:
nnn = NodeNodeNgram.objects.get(nodex=corpus, nodey=document, ngram=node_ngram.ngram)
......
......@@ -60,6 +60,7 @@ def tfidf(corpus, document, ngram):
.filter(NodeNgram.ngram_id == ngram.id)\
.count()
# print("\t\t\t","occs:",occurrences_of_ngram," || ngramsbydoc:",ngrams_by_document," || TF = occ/ngramsbydoc:",term_frequency," |||||| x:",xx," || y:",yy," || IDF = log(x/y):",log(xx/yy))
inverse_document_frequency = log(xx/yy)
# result = tf * idf
......
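For reference, the quantity these tfidf hunks build up reduces to the classic formula below. This is a minimal standalone sketch with hypothetical argument names (in the repo, the counts such as xx and yy come from Django ORM queries), not the committed implementation:

from math import log

def tfidf_score(occurrences_of_ngram, ngrams_by_document,
                total_documents, documents_containing_ngram):
    # TF: ngram occurrences relative to all ngrams in the document
    term_frequency = occurrences_of_ngram / ngrams_by_document
    # IDF: natural log of total documents over documents containing the ngram
    inverse_document_frequency = log(total_documents / documents_containing_ngram)
    return term_frequency * inverse_document_frequency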
......@@ -14,6 +14,7 @@ from parsing.FileParsers import *
from time import time
import datetime
from multiprocessing import Process
from math import log
from collections import defaultdict
import hashlib
......@@ -229,13 +230,6 @@ class Node(CTENode):
for ngram in extractor.extract_ngrams(self.metadata[key]):
terms = ' '.join([token for token, tag in ngram])
associations[terms] += 1
import pprint
pprint.pprint(associations)
print(" - - - - - ")
#print(associations)
# insert the occurrences in the database
# print(associations.items())
Node_Ngram.objects.bulk_create([
Node_Ngram(
node = self,
......@@ -284,6 +278,14 @@ class Node(CTENode):
def runInParallel(self, *fns):
proc = []
for fn in fns:
p = Process(target=fn)
p.start()
proc.append(p)
for p in proc:
p.join()
def parse_resources__MOV(self, verbose=False):
# parse all resources into a list of metadata
......@@ -324,7 +326,6 @@ class Node(CTENode):
language_id = language.id if language else None,
metadata = metadata_values
).save()
metadata_list[i]["thelang"] = language
# # make metadata filterable
self.children.all().make_metadata_filterable()
# # mark the resources as parsed for this node
......@@ -338,48 +339,63 @@ class Node(CTENode):
if ngramscaches is None:
ngramscaches = NgramsCaches()
results = []
i = 0
for metadata in array:
associations = defaultdict(float) # float or int?
# no trailing comma here: it would turn `language` into a 1-tuple
language = langages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None
metadata["thelang"] = language
extractor = ngramsextractorscache[language]
ngrams = ngramscaches[language]
# print("\t\t number of req keys:",len(keys)," AND isdict?:",isinstance(keys, dict))
# theText = []
if isinstance(keys, dict):
for key, weight in keys.items():
if key in metadata:
for ngram in extractor.extract_ngrams(metadata[key]):
terms = ' '.join([token for token, tag in ngram])
# strip stray list brackets from the metadata value before ngram extraction
text2process = str(metadata[key]).replace('[','').replace(']','')
# theText.append(text2process)
for ngram in extractor.extract_ngrams(text2process):
terms = ' '.join([token for token, tag in ngram]).strip().lower()
associations[terms] += weight # key by the normalized terms string, as in the else branch below
else:
for key in keys:
if key in metadata:
# print("the_content:[[[[[[__",metadata[key],"__]]]]]]")
for ngram in extractor.extract_ngrams(metadata[key]):
terms = ' '.join([token for token, tag in ngram])
text2process = str(metadata[key]).replace('[','').replace(']','')
# theText.append(text2process)
for ngram in extractor.extract_ngrams(text2process):
terms = ' '.join([token for token, tag in ngram]).strip().lower()
associations[terms] += 1
if len(associations) > 0:
Node_Ngram.objects.bulk_create([
Node_Ngram(
node = self,
ngram = ngrams[ngram_text],
weight = weight
)
for ngram_text, weight in associations.items()
])
# for ngram_text, weight in associations.items():
# print("ngram_text:",ngram_text," | weight:",weight, " | ngrams[ngram_text]:",ngrams[ngram_text])
def runInParallel(self, *fns):
proc = []
for fn in fns:
p = Process(target=fn)
p.start()
proc.append(p)
for p in proc:
p.join()
if len(associations) > 0:
results.append([i, associations])
i += 1
return results
def do_tfidf__MOV( self, FreqList ):
IDFList = {}
for i in FreqList:
arrayID = i[0]
associations = i[1]
for ngram_text, weight in associations.items():
if ngram_text in IDFList: IDFList[ngram_text] += 1
else: IDFList[ngram_text] = 1
N = float(len(FreqList)) # number of docs actually processed
for i in FreqList:
arrayID = i[0]
associations = i[1]
ngrams_by_document = len(associations)
for ngram_text, weight in associations.items():
occurrences_of_ngram = weight
term_frequency = occurrences_of_ngram / ngrams_by_document
xx = N
yy = IDFList[ngram_text]
inverse_document_frequency = log(xx/yy) # natural log (base e)
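# NOTE: term_frequency and inverse_document_frequency are computed above but
# their product is never stored or returned yet; see the sketch after this diff.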
def workflow__MOV(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
import time
......@@ -394,32 +410,44 @@ class Node(CTENode):
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources()__MOV [s]",(end - start))
print("LOG::TIME: In workflow() writeMetadata__MOV()")
start = time.time()
self.writeMetadata__MOV( metadata_list=theMetadata )
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]",(end - start))
# print("LOG::TIME: In workflow() writeMetadata__MOV()")
# start = time.time()
# self.writeMetadata__MOV( metadata_list=theMetadata )
# end = time.time()
# total += (end - start)
# print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]",(end - start))
print("LOG::TIME: In workflow() extract_ngrams__MOV()")
start = time.time()
self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] )
FreqList = self.extract_ngrams__MOV(theMetadata , keys=['title',] )
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams__MOV() [s]",(end - start))
# # this is not working
# self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )
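# # (likely why: the line above *calls* both methods eagerly and hands their
# # return values to runInParallel, but Process targets must be callables;
# # e.g. wrap them with functools.partial(self.writeMetadata__MOV,
# # metadata_list=theMetadata) before passing them in)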
# start = time.time()
# print("LOG::TIME: In workflow() do_tfidf()")
# self.do_tfidf__MOV( FreqList )
# end = time.time()
# total += (end - start)
# print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
# # # print("LOG::TIME: In workflow() / do_tfidf()")
start = time.time()
print("LOG::TIME: In workflow() do_tfidf()")
from analysis.functions import do_tfidf
do_tfidf(self)
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
# # print("LOG::TIME: In workflow() / do_tfidf()")
# # # this is not working
# # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )
# start = time.time()
# print("LOG::TIME: In workflow() do_tfidf()")
# from analysis.functions import do_tfidf
# do_tfidf(self)
# end = time.time()
# total += (end - start)
# print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
# # # print("LOG::TIME: In workflow() / do_tfidf()")
print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" In workflow() END")
......
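Note that do_tfidf__MOV makes two passes over FreqList (document frequencies first, then per-ngram scores) but currently discards the result. Below is a hedged sketch of how those same two passes could collect the scores, assuming FreqList is the list of [i, associations] pairs returned by extract_ngrams__MOV; the function name is hypothetical:

from collections import defaultdict
from math import log

def tfidf_scores__sketch(freq_list):
    # pass 1: document frequency, i.e. in how many docs each ngram appears
    docs_with_ngram = defaultdict(int)
    for _, associations in freq_list:
        for ngram_text in associations:
            docs_with_ngram[ngram_text] += 1
    n_docs = float(len(freq_list))  # number of docs actually processed
    # pass 2: tf-idf per (document, ngram) pair
    scores = {}
    for doc_id, associations in freq_list:
        ngrams_by_document = len(associations)
        for ngram_text, weight in associations.items():
            tf = weight / ngrams_by_document
            idf = log(n_docs / docs_with_ngram[ngram_text])  # natural log
            scores[(doc_id, ngram_text)] = tf * idf
    return scores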
......@@ -25,6 +25,7 @@ class PubmedFileParser(FileParser):
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
# "abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"title2" : 'MedlineCitation/Article/VernacularTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
......@@ -102,6 +103,7 @@ class PubmedFileParser(FileParser):
if "title2" in metadata: metadata.pop("title2")
# print(metadata)
# print("* * * * ** * * * * ")
metadata_list.append(metadata)
# return the list of metadata
return metadata_list
from ..Taggers import Tagger
from ..Taggers import TurboTagger
import nltk
......@@ -18,7 +18,7 @@ class NgramsExtractor:
self.stop()
def start(self):
self.tagger = Tagger()
self.tagger = TurboTagger()
def stop(self):
pass
......
......@@ -59,8 +59,10 @@ class Tagger:
self.tagging_end()
return []
"""Send a text to be tagged.
"""
# Not used right now
def tag_text(self, text):
tokens_tags = []
self.tagging_start()
......