[UPDATE] pubmedquerier should now work, in theory

d40ede9a · PkSM3 · 58e9bb2b · d40ede9a · d40ede9a
Commit d40ede9a authored Mar 25, 2015 by PkSM3
Hide whitespace changes
Inline Side-by-side

Showing with 14 additions and 44 deletions

corpustools.py parsing/corpustools.py +1 -31

views.py scrap_pubmed/views.py +13 -13

No files found.
--- a/parsing/corpustools.py
+++ b/parsing/corpustools.py
@@ -206,23 +206,18 @@ def extract_ngrams(corpus, keys):
        .filter(Node.type_id == cache.NodeType['Document'].id)
    )
    # prepare data to be inserted
-    print("\n= = = = = =")
    dbg.show('find ngrams')
-    print('000001')
    languages_by_id = {
        language.id: language.iso2
        for language in session.query(Language)
    }
-    print('000002')
    ngrams_data = set()
    ngrams_language_data = set()
    ngrams_tag_data = set()
-    print('000003')
    node_ngram_list = defaultdict(lambda: defaultdict(int))
    for nodeinfo in metadata_query:
-        print('\t000004')
        node_id = nodeinfo[0]
        language_id = nodeinfo[1]
@@ -233,27 +228,13 @@ def extract_ngrams(corpus, keys):
        if language_iso2 is None:
            continue
-        print('\t000005')
-        print('\t',language_iso2)
        ngramsextractor = ngramsextractors[language_iso2]
-        print('\t',ngramsextractor)
-        print('\t000006')
        for text in nodeinfo[2:]:
            if text is not None and len(text):
-                print('\t\t000007')
                ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]",""))
-                # print(ngrams)
-                print('\t\t000008')
                for ngram in ngrams:
-                    print('\t\t\t000009')
-                    print('\t\t\t',ngram)
                    n = len(ngram)
-                    print('\t\t\tn:',n)
-                    print('\t\t\t000010')
                    terms    = ' '.join([token for token, tag in ngram]).lower()
-                    print('\t\t\t000011')
-                    import pprint
-                    pprint.pprint(cache.Tag)
                    # TODO BUG here
                    if n == 1:
                        tag_id   = cache.Tag[ngram[0][1]].id
@@ -263,20 +244,11 @@ def extract_ngrams(corpus, keys):
                        tag_id   = cache.Tag['NN'].id
                        #tag_id   =  14
                        #print('tag_id_2', tag_id)
-                    print('\t\t\t000012')
                    node_ngram_list[node_id][terms] += 1
-                    print('\t\t\t000013')
                    ngrams_data.add((n, terms))
-                    print('\t\t\t000014')
                    ngrams_language_data.add((terms, language_id))
-                    print('\t\t\t000015')
                    ngrams_tag_data.add((terms, tag_id))
-                    print('\t\t\t000016')
-                print('\t\t000018')
-        print('\t\t000019')
-        # dbg.show('\t000007')
-    print('000020')
    # insert ngrams to temporary table
    dbg.show('find ids for the %d ngrams' % len(ngrams_data))
    db, cursor = get_cursor()
@@ -347,8 +319,6 @@ def extract_ngrams(corpus, keys):
    # commit to database
    db.commit()
-    print("= = = = = =\n")
 # tfidf calculation
 def compute_tfidf(corpus):

--- a/scrap_pubmed/views.py
+++ b/scrap_pubmed/views.py
@@ -164,19 +164,19 @@ def doTheQuery(request , project_id):
 		if dwnldsOK == 0: return JsonHttpResponse(["fail"])
-		try: parse_resources(corpus)
+		try:
-		except Exception as error: print("!OK parse:",error)
+			def apply_workflow(corpus):
+				parse_resources(corpus)
-		try: extract_ngrams(corpus, ['title'])
+				extract_ngrams(corpus, ['title'])
-		except Exception as error: print("!OK ngrams:",error)
+				compute_tfidf(corpus)
+			if DEBUG:
-		# try: compute_tfidf(corpus)
+				apply_workflow(corpus)
-		# except Exception as error: print("!OK tfidf:",error)
+			else:
+				thread = threading.Thread(target=apply_workflow, args=(corpus, ), daemon=True)
-		# # except Exception as error:
+				thread.start()
-		# # 	print('WORKFLOW ERROR')
+		except Exception as error:
-		# # 	print(error)
+			print('WORKFLOW ERROR')
-		# # # redirect to the main project page
+			print(error)
 		return HttpResponseRedirect('/project/' + str(project_id))
 	data = alist