Commit d40ede9a authored by PkSM3

[UPDATE] in theory, pubmedquerier OK

parent 58e9bb2b
...@@ -206,23 +206,18 @@ def extract_ngrams(corpus, keys): ...@@ -206,23 +206,18 @@ def extract_ngrams(corpus, keys):
.filter(Node.type_id == cache.NodeType['Document'].id) .filter(Node.type_id == cache.NodeType['Document'].id)
) )
# prepare data to be inserted # prepare data to be inserted
print("\n= = = = = =")
dbg.show('find ngrams') dbg.show('find ngrams')
print('000001')
languages_by_id = { languages_by_id = {
language.id: language.iso2 language.id: language.iso2
for language in session.query(Language) for language in session.query(Language)
} }
print('000002')
ngrams_data = set() ngrams_data = set()
ngrams_language_data = set() ngrams_language_data = set()
ngrams_tag_data = set() ngrams_tag_data = set()
print('000003')
node_ngram_list = defaultdict(lambda: defaultdict(int)) node_ngram_list = defaultdict(lambda: defaultdict(int))
for nodeinfo in metadata_query: for nodeinfo in metadata_query:
print('\t000004')
node_id = nodeinfo[0] node_id = nodeinfo[0]
language_id = nodeinfo[1] language_id = nodeinfo[1]
...@@ -233,27 +228,13 @@ def extract_ngrams(corpus, keys): ...@@ -233,27 +228,13 @@ def extract_ngrams(corpus, keys):
if language_iso2 is None: if language_iso2 is None:
continue continue
print('\t000005')
print('\t',language_iso2)
ngramsextractor = ngramsextractors[language_iso2] ngramsextractor = ngramsextractors[language_iso2]
print('\t',ngramsextractor)
print('\t000006')
for text in nodeinfo[2:]: for text in nodeinfo[2:]:
if text is not None and len(text): if text is not None and len(text):
print('\t\t000007')
ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]","")) ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]",""))
# print(ngrams)
print('\t\t000008')
for ngram in ngrams: for ngram in ngrams:
print('\t\t\t000009')
print('\t\t\t',ngram)
n = len(ngram) n = len(ngram)
print('\t\t\tn:',n)
print('\t\t\t000010')
terms = ' '.join([token for token, tag in ngram]).lower() terms = ' '.join([token for token, tag in ngram]).lower()
print('\t\t\t000011')
import pprint
pprint.pprint(cache.Tag)
# TODO BUG here # TODO BUG here
if n == 1: if n == 1:
tag_id = cache.Tag[ngram[0][1]].id tag_id = cache.Tag[ngram[0][1]].id
...@@ -263,20 +244,11 @@ def extract_ngrams(corpus, keys): ...@@ -263,20 +244,11 @@ def extract_ngrams(corpus, keys):
tag_id = cache.Tag['NN'].id tag_id = cache.Tag['NN'].id
#tag_id = 14 #tag_id = 14
#print('tag_id_2', tag_id) #print('tag_id_2', tag_id)
print('\t\t\t000012')
node_ngram_list[node_id][terms] += 1 node_ngram_list[node_id][terms] += 1
print('\t\t\t000013')
ngrams_data.add((n, terms)) ngrams_data.add((n, terms))
print('\t\t\t000014')
ngrams_language_data.add((terms, language_id)) ngrams_language_data.add((terms, language_id))
print('\t\t\t000015')
ngrams_tag_data.add((terms, tag_id)) ngrams_tag_data.add((terms, tag_id))
print('\t\t\t000016')
print('\t\t000018')
print('\t\t000019')
# dbg.show('\t000007')
print('000020')
# insert ngrams to temporary table # insert ngrams to temporary table
dbg.show('find ids for the %d ngrams' % len(ngrams_data)) dbg.show('find ids for the %d ngrams' % len(ngrams_data))
db, cursor = get_cursor() db, cursor = get_cursor()
...@@ -347,8 +319,6 @@ def extract_ngrams(corpus, keys): ...@@ -347,8 +319,6 @@ def extract_ngrams(corpus, keys):
# commit to database # commit to database
db.commit() db.commit()
print("= = = = = =\n")
# tfidf calculation # tfidf calculation
def compute_tfidf(corpus): def compute_tfidf(corpus):
......
...@@ -164,19 +164,19 @@ def doTheQuery(request , project_id): ...@@ -164,19 +164,19 @@ def doTheQuery(request , project_id):
if dwnldsOK == 0: return JsonHttpResponse(["fail"]) if dwnldsOK == 0: return JsonHttpResponse(["fail"])
try: parse_resources(corpus) try:
except Exception as error: print("!OK parse:",error) def apply_workflow(corpus):
parse_resources(corpus)
try: extract_ngrams(corpus, ['title']) extract_ngrams(corpus, ['title'])
except Exception as error: print("!OK ngrams:",error) compute_tfidf(corpus)
if DEBUG:
# try: compute_tfidf(corpus) apply_workflow(corpus)
# except Exception as error: print("!OK tfidf:",error) else:
thread = threading.Thread(target=apply_workflow, args=(corpus, ), daemon=True)
# # except Exception as error: thread.start()
# # print('WORKFLOW ERROR') except Exception as error:
# # print(error) print('WORKFLOW ERROR')
# # # redirect to the main project page print(error)
return HttpResponseRedirect('/project/' + str(project_id)) return HttpResponseRedirect('/project/' + str(project_id))
data = alist data = alist
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment