Commit bc8107f6 authored by c24b

[PATCH] resource_loop + BATCH NGRAMS EXTRACTION + Lang

parent ff681f29
...@@ -27,7 +27,7 @@ class ISTexParser(Parser): ...@@ -27,7 +27,7 @@ class ISTexParser(Parser):
} }
suma = 0 suma = 0
print(len(json_docs))
for json_doc in json_docs: for json_doc in json_docs:
hyperdata = {} hyperdata = {}
...@@ -92,9 +92,9 @@ class ISTexParser(Parser): ...@@ -92,9 +92,9 @@ class ISTexParser(Parser):
hyperdata["language_iso3"] = "eng" hyperdata["language_iso3"] = "eng"
# (cf. api.istex.fr/document/?q=*&facet=language # (cf. api.istex.fr/document/?q=*&facet=language
# et tests langid sur les language=["unknown"]) # et tests langid sur les language=["unknown"])
#just to be sure
hyperdata = self.format_hyperdata_languages(hyperdata) hyperdata = self.format_hyperdata_languages(hyperdata)
if "publication_date" in hyperdata: if "publication_date" in hyperdata:
RealDate = hyperdata["publication_date"] RealDate = hyperdata["publication_date"]
if "publication_date" in hyperdata: if "publication_date" in hyperdata:
......
...@@ -62,7 +62,8 @@ def parse_extract_indexhyperdata(corpus): ...@@ -62,7 +62,8 @@ def parse_extract_indexhyperdata(corpus):
# apply actions # apply actions
print('CORPUS #%d' % (corpus.id)) print('CORPUS #%d' % (corpus.id))
parse(corpus) parse(corpus)
print('CORPUS #%d: parsed' % (corpus.id)) docs = corpus.children("DOCUMENT").count()
print('CORPUS #%d: parsed %d' % (corpus.id, docs))
extract_ngrams(corpus) extract_ngrams(corpus)
# Preparing Database # Preparing Database
......
This diff is collapsed.
...@@ -8,7 +8,7 @@ from traceback import print_tb ...@@ -8,7 +8,7 @@ from traceback import print_tb
from django.shortcuts import redirect, render from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource_by_name, QUERY_SIZE_N_MAX from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node from gargantext.models.nodes import Node
from gargantext.util.db import session from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse from gargantext.util.http import JsonHttpResponse
...@@ -16,7 +16,7 @@ from gargantext.util.scheduling import scheduled ...@@ -16,7 +16,7 @@ from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata from gargantext.util.toolchain import parse_extract_indexhyperdata
from moissonneurs.util import Scraper from moissonneurs.util import Scraper
RESOURCE_TYPE_ISTEX = 8
def query( request ): def query( request ):
...@@ -85,7 +85,7 @@ def save(request , project_id): ...@@ -85,7 +85,7 @@ def save(request , project_id):
query = "-" query = "-"
query_string = "-" query_string = "-"
N = QUERY_SIZE_N_MAX #N = QUERY_SIZE_N_MAX
if "query" in request.POST: if "query" in request.POST:
query = request.POST["query"] query = request.POST["query"]
...@@ -96,10 +96,12 @@ def save(request , project_id): ...@@ -96,10 +96,12 @@ def save(request , project_id):
N = QUERY_SIZE_N_MAX N = QUERY_SIZE_N_MAX
else: else:
N = int(request.POST["N"]) # query_size from views_opti N = int(request.POST["N"]) # query_size from views_opti
if N > QUERY_SIZE_N_MAX: if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX) N = QUERY_SIZE_N_MAX
print("ERROR (scrap: istex d/l ): ",msg) #msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
raise ValueError(msg) #print("ERROR (scrap: istex d/l ): ",msg)
#raise ValueError(msg)
print("Scrapping Istex: '%s' (%i)" % (query_string , N)) print("Scrapping Istex: '%s' (%i)" % (query_string , N))
...@@ -107,6 +109,7 @@ def save(request , project_id): ...@@ -107,6 +109,7 @@ def save(request , project_id):
pagesize = 50 pagesize = 50
tasks = Scraper() tasks = Scraper()
chunks = list(tasks.chunks(range(N), pagesize)) chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks: for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0] if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize)) urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
...@@ -131,6 +134,7 @@ def save(request , project_id): ...@@ -131,6 +134,7 @@ def save(request , project_id):
t = threading.Thread(target=tasks.worker2) #thing to do t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits. t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start() t.start()
for url in urlreqs: for url in urlreqs:
tasks.q.put( url ) #put a task in the queue tasks.q.put( url ) #put a task in the queue
tasks.q.join() # wait until everything is finished tasks.q.join() # wait until everything is finished
...@@ -140,21 +144,21 @@ def save(request , project_id): ...@@ -140,21 +144,21 @@ def save(request , project_id):
if filename!=False: if filename!=False:
# add the uploaded resource to the corpus # add the uploaded resource to the corpus
corpus.add_resource( corpus.add_resource(
type = get_resource_by_name('ISTex')["type"] type = get_resource(RESOURCE_TYPE_ISTEX)["type"]
, path = filename , path = filename
) )
dwnldsOK+=1 dwnldsOK+=1
session.add(corpus) session.add(corpus)
session.commit() session.commit()
corpus_id = corpus.id #corpus_id = corpus.id
if dwnldsOK == 0 : if dwnldsOK == 0 :
return JsonHttpResponse(["fail"]) return JsonHttpResponse(["fail"])
########################### ###########################
########################### ###########################
try: try:
scheduled(parse_extract_indexhyperdata)(corpus_id) scheduled(parse_extract_indexhyperdata)(corpus.id)
except Exception as error: except Exception as error:
print('WORKFLOW ERROR') print('WORKFLOW ERROR')
print(error) print(error)
...@@ -178,4 +182,5 @@ def save(request , project_id): ...@@ -178,4 +182,5 @@ def save(request , project_id):
data = [query_string,query,N] data = [query_string,query,N]
print(data)
return JsonHttpResponse(data) return JsonHttpResponse(data)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment