Commit 212c9b2a authored by Administrator

Merge branch 'testing' into stable

parents 023405ac ea5c610f
@@ -268,19 +268,20 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
 from analysis.tfidf import tfidf

 def do_tfidf(corpus, reset=True):
-    print("doing tfidf")
+    print("=========== doing tfidf ===========")
     with transaction.atomic():
         if reset==True:
             NodeNodeNgram.objects.filter(nodex=corpus).delete()
         if isinstance(corpus, Node) and corpus.type.name == "Corpus":
             # print("\n- - - - - - - - - - ")
-            # for i in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
-            # print("^^^",i)
+            # # for i in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
             for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
-                for node_ngram in Node_Ngram.objects.filter(node=document):
+                # print("the doc:",document)
+                somevariable = Node_Ngram.objects.filter(node=document)
+                for node_ngram in somevariable:
                     try:
-                        # print("\t",node_ngram.ngram)
+                        # print("\tngram:",node_ngram.ngram, " @@@ type:",type(node_ngram.ngram))
                         nnn = NodeNodeNgram.objects.get(nodex=corpus, nodey=document, ngram=node_ngram.ngram)
                     except:
                         score = tfidf(corpus, document, node_ngram.ngram)
...
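Note on the hunk above: `tfidf(corpus, document, node_ngram.ngram)` produces the score stored on the `NodeNodeNgram` relation when no row exists yet. For readers unfamiliar with the metric, a minimal self-contained sketch of the standard tf-idf formula follows; the function name, arguments, and normalisation are illustrative assumptions, not the actual `analysis.tfidf` implementation, which presumably derives these counts from database queries:

import math

def tfidf_score(occurrences, document_length, documents_count, documents_with_term):
    # term frequency: how often the ngram occurs in this document,
    # normalised by the document's total term count
    tf = occurrences / document_length
    # inverse document frequency: rarity of the ngram across the corpus
    idf = math.log(documents_count / documents_with_term)
    return tf * idf

# an ngram occurring 3 times in a 100-term document,
# found in 10 of the corpus's 1000 documents:
print(tfidf_score(3, 100, 1000, 10))  # 0.03 * ln(100) ~= 0.138

A side remark on the same hunk: the bare `except:` swallows any database error, not just the missing-row case; `NodeNodeNgram.DoesNotExist` would be the precise exception to catch.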
@@ -10,8 +10,11 @@ from cte_tree.models import CTENode, CTENodeManager
 from parsing.Caches import LanguagesCache, NgramsExtractorsCache, NgramsCaches
 from parsing.FileParsers import *
 from time import time
 import datetime
+from multiprocessing import Process
 from collections import defaultdict
 import hashlib
@@ -175,12 +178,6 @@ class Node(CTENode):
                 'europress_english' : EuropressFileParser,
             })[resource.type.name]()
             metadata_list += parser.parse(str(resource.file))
-            # print(parser.parse(str(resource.file)))
-            # # retrieve info from the database
-            # print("\n - - -- - - - - - - - ")
-            # for i in metadata_list:
-            # print("***",i["title"])
-            # print("- - -- - - - - - - - \n")
         type_id = NodeType.objects.get(name='Document').id
         langages_cache = LanguagesCache()
         user_id = self.user.id
@@ -208,7 +205,6 @@ class Node(CTENode):
         # mark the resources as parsed for this node
         self.node_resource.update(parsed=True)
-
     @current_app.task(filter=task_method)
     def extract_ngrams(self, keys, ngramsextractorscache=None, ngramscaches=None):
         # if there is no cache...
@@ -233,6 +229,9 @@ class Node(CTENode):
                 for ngram in extractor.extract_ngrams(self.metadata[key]):
                     terms = ' '.join([token for token, tag in ngram])
                     associations[terms] += 1
+        # import pprint
+        # pprint.pprint(associations)
         #print(associations)
         # insert the occurrences in the database
+        # print(associations.items())
@@ -282,6 +281,150 @@ class Node(CTENode):
         self.metadata['Processing'] = 0
         self.save()
+
+    def parse_resources__MOV(self, verbose=False):
+        # parse all resources into a list of metadata
+        metadata_list = []
+        print("not parsed resources:")
+        print(self.node_resource.filter(parsed=False))
+        print("= = = = = = = = = = =\n")
+        for node_resource in self.node_resource.filter(parsed=False):
+            resource = node_resource.resource
+            parser = defaultdict(lambda: FileParser.FileParser, {
+                'istext'            : ISText,
+                'pubmed'            : PubmedFileParser,
+                'isi'               : IsiFileParser,
+                'ris'               : RisFileParser,
+                'europress'         : EuropressFileParser,
+                'europress_french'  : EuropressFileParser,
+                'europress_english' : EuropressFileParser,
+            })[resource.type.name]()
+            metadata_list += parser.parse(str(resource.file))
+        self.node_resource.update(parsed=True)  # writing to DB
+        return metadata_list
+
+    def writeMetadata__MOV(self, metadata_list=None, verbose=False):
+        type_id = NodeType.objects.get(name='Document').id
+        user_id = self.user.id
+        langages_cache = LanguagesCache()
+        # # insert the new resources in the database!
+        for i, metadata_values in enumerate(metadata_list):
+            name = metadata_values.get('title', '')[:200]
+            language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
+            if isinstance(language, tuple):
+                language = language[0]
+            Node(
+                user_id = user_id,
+                type_id = type_id,
+                name = name,
+                parent = self,
+                language_id = language.id if language else None,
+                metadata = metadata_values
+            ).save()
+            metadata_list[i]["thelang"] = language
+        # # make metadata filterable
+        self.children.all().make_metadata_filterable()
+        # # mark the resources as parsed for this node
+        self.node_resource.update(parsed=True)
+
+    def extract_ngrams__MOV(self, array, keys, ngramsextractorscache=None, ngramscaches=None):
+        if ngramsextractorscache is None:
+            ngramsextractorscache = NgramsExtractorsCache()
+        langages_cache = LanguagesCache()
+        if ngramscaches is None:
+            ngramscaches = NgramsCaches()
+        for metadata in array:
+            associations = defaultdict(float)  # float or int?
+            language = langages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None,
+            if isinstance(language, tuple):
+                language = language[0]
+            metadata["thelang"] = language
+            extractor = ngramsextractorscache[language]
+            ngrams = ngramscaches[language]
+            # print("\t\t number of req keys:",len(keys)," AND isdict?:",isinstance(keys, dict))
+            if isinstance(keys, dict):
+                for key, weight in keys.items():
+                    if key in metadata:
+                        for ngram in extractor.extract_ngrams(metadata[key]):
+                            terms = ' '.join([token for token, tag in ngram])
+                            associations[ngram] += weight
+            else:
+                for key in keys:
+                    if key in metadata:
+                        # print("the_content:[[[[[[__",metadata[key],"__]]]]]]")
+                        for ngram in extractor.extract_ngrams(metadata[key]):
+                            terms = ' '.join([token for token, tag in ngram])
+                            associations[terms] += 1
+            if len(associations.items())>0:
+                Node_Ngram.objects.bulk_create([
+                    Node_Ngram(
+                        node = self,
+                        ngram = ngrams[ngram_text],
+                        weight = weight
+                    )
+                    for ngram_text, weight in associations.items()
+                ])
+                # for ngram_text, weight in associations.items():
+                #     print("ngram_text:",ngram_text," | weight:",weight, " | ngrams[ngram_text]:",ngrams[ngram_text])
+
+    def runInParallel(self, *fns):
+        proc = []
+        for fn in fns:
+            p = Process(target=fn)
+            p.start()
+            proc.append(p)
+        for p in proc:
+            p.join()
+
+    def workflow__MOV(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
+        import time
+        total = 0
+        self.metadata['Processing'] = 1
+        self.save()
+
+        print("LOG::TIME: In workflow() parse_resources__MOV()")
+        start = time.time()
+        theMetadata = self.parse_resources__MOV()
+        end = time.time()
+        total += (end - start)
+        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources()__MOV [s]",(end - start))
+
+        print("LOG::TIME: In workflow() writeMetadata__MOV()")
+        start = time.time()
+        self.writeMetadata__MOV( metadata_list=theMetadata )
+        end = time.time()
+        total += (end - start)
+        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]",(end - start))
+
+        print("LOG::TIME: In workflow() extract_ngrams__MOV()")
+        start = time.time()
+        self.extract_ngrams__MOV( theMetadata, keys=['title','abstract',] )
+        end = time.time()
+        total += (end - start)
+        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams__MOV() [s]",(end - start))
+
+        # # this is not working
+        # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ), self.extract_ngrams__MOV( theMetadata, keys=['title','abstract',] ) )
+
+        start = time.time()
+        print("LOG::TIME: In workflow() do_tfidf()")
+        from analysis.functions import do_tfidf
+        do_tfidf(self)
+        end = time.time()
+        total += (end - start)
+        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
+        # # print("LOG::TIME: In workflow() / do_tfidf()")
+
+        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" In workflow() END")
+
+        self.metadata['Processing'] = 0
+        self.save()
+
 class Node_Metadata(models.Model):
     node = models.ForeignKey(Node, on_delete=models.CASCADE)
     metadata = models.ForeignKey(Metadata)
...
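Review notes on the new `__MOV` methods above:

1. The call flagged "this is not working" in `workflow__MOV()` fails because it invokes `writeMetadata__MOV(...)` and `extract_ngrams__MOV(...)` immediately and hands their return values (`None`) to `runInParallel`, while `Process(target=fn)` expects callables. A minimal runnable sketch of the fix pattern, with stand-in functions (real Django code would also need to close its database connection before forking, since child processes inherit the parent's socket):

from functools import partial
from multiprocessing import Process

def runInParallel(*fns):
    # start one child process per callable, then wait for all of them
    procs = [Process(target=fn) for fn in fns]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

def write_metadata(metadata_list=None):
    print("writing", len(metadata_list), "records")

def extract_ngrams(array, keys):
    print("extracting from", len(array), "records using keys", keys)

if __name__ == '__main__':
    theMetadata = [{'title': 't1'}, {'title': 't2'}]
    # pass callables, not call results: partial defers the invocation
    # so each child process performs the actual work
    runInParallel(
        partial(write_metadata, metadata_list=theMetadata),
        partial(extract_ngrams, theMetadata, keys=['title', 'abstract']),
    )

2. In the weighted branch of `extract_ngrams__MOV()`, `terms` is computed but the accumulator is keyed with `associations[ngram] += weight`; to match the `ngrams[ngram_text]` lookup in the `bulk_create` below it, this presumably should be `associations[terms] += weight`.

3. The trailing comma on both `language = ... else None,` lines makes the expression a one-element tuple; the `isinstance(language, tuple)` check unwraps it again, but dropping the comma would make the unwrapping unnecessary.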
@@ -2,3 +2,4 @@ from .RisFileParser import RisFileParser
 from .IsiFileParser import IsiFileParser
 from .PubmedFileParser import PubmedFileParser
 from .EuropressFileParser import EuropressFileParser
+from .ISText import ISText
@@ -29,6 +29,8 @@ class NgramsExtractor:
     """
     def extract_ngrams(self, contents):
        tagged_ngrams = self.tagger.tag_text(contents)
+       if len(tagged_ngrams)==0: return []
+
        grammar = nltk.RegexpParser(self._rule)
        result = []
        # try:
...
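The new guard returns early when the tagger yields no tokens (e.g. empty or whitespace-only `contents`), so the chunking below it never sees empty input. For context, a minimal sketch of that chunking step using the nltk 3 API; the noun-phrase rule is an assumption, the class's actual `self._rule` may differ:

import nltk

# assumed rule: optional adjectives followed by nouns form a candidate ngram
grammar = nltk.RegexpParser('NP: {<JJ.*>*<NN.*>+}')

tagged = [('text', 'NN'), ('mining', 'NN'), ('is', 'VBZ'), ('useful', 'JJ')]
tree = grammar.parse(tagged)  # chunk the tagged tokens into a parse tree

# the NP subtrees are the extracted ngram candidates
for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
    print(subtree.leaves())   # [('text', 'NN'), ('mining', 'NN')]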
@@ -133,6 +133,7 @@ def doTheQuery(request , project_id):
     try:
         if DEBUG is True:
             corpus.workflow()
+            # corpus.workflow__MOV()
         else:
             corpus.workflow.apply_async((), countdown=3)
@@ -205,7 +206,7 @@ def testISTEX(request , project_id):
     corpus.save()
-
+    print("DEBUG:",DEBUG)
     # do the WorkFlow
     try:
         if DEBUG is True:
...
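Background on the `DEBUG` branches above: with Celery method tasks (the `@current_app.task(filter=task_method)` decorator seen in the models hunk), `corpus.workflow()` runs the task synchronously in the web process, while `corpus.workflow.apply_async((), countdown=3)` queues it for a worker with a 3-second delay. A minimal sketch of the same pattern with a plain function task; the app name and broker URL are placeholder assumptions:

from celery import Celery

app = Celery('gargantext', broker='amqp://localhost')

@app.task
def workflow():
    print("running the corpus workflow")

workflow()                             # synchronous, in-process (the DEBUG path)
workflow.apply_async((), countdown=3)  # queued; a worker starts it after >= 3 s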