Commit 5f610771 authored by c24b

MERGE OK [patch] romain-stable-patch + c24b-stable-patch

parents db1b31a2 e5d4e175
@@ -27,7 +27,7 @@ class ISTexParser(Parser):
}
suma = 0
print(len(json_docs))
for json_doc in json_docs:
hyperdata = {}
@@ -92,9 +92,9 @@ class ISTexParser(Parser):
hyperdata["language_iso3"] = "eng"
# (cf. api.istex.fr/document/?q=*&facet=language
# and langid tests on the language=["unknown"] documents)
# just to be sure
hyperdata = self.format_hyperdata_languages(hyperdata)
if "publication_date" in hyperdata:
RealDate = hyperdata["publication_date"]
if "publication_date" in hyperdata:
......
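The hunk above hardens the language fallback: documents still tagged language=["unknown"] by the ISTex API default to "eng", and everything is then passed through format_hyperdata_languages. That helper is not shown in this diff; below is a minimal sketch of what such a normalizer might do, with an illustrative (not the actual) iso3-to-iso2 lookup table:

# sketch only: the real format_hyperdata_languages lives in the Parser
# base class and its lookup table is larger than this illustrative one
ISO3_TO_ISO2 = {"eng": "en", "fra": "fr", "deu": "de"}

def format_hyperdata_languages(hyperdata):
    iso3 = hyperdata.get("language_iso3")
    if iso3 in ISO3_TO_ISO2:
        hyperdata["language_iso2"] = ISO3_TO_ISO2[iso3]
    else:
        # leave the language undetermined so downstream steps
        # (e.g. ngram extraction) can decide to skip the doc
        hyperdata.pop("language_iso2", None)
    return hyperdata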
@@ -62,7 +62,8 @@ def parse_extract_indexhyperdata(corpus):
# apply actions
print('CORPUS #%d' % (corpus.id))
parse(corpus)
print('CORPUS #%d: parsed' % (corpus.id))
docs = corpus.children("DOCUMENT").count()
print('CORPUS #%d: parsed %d' % (corpus.id, docs))
extract_ngrams(corpus)
# Preparing Database
......
@@ -47,99 +47,76 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
resource = corpus.resources()[0]
documents_count = 0
source = get_resource(resource["type"])
# preload available taggers for corpus languages
tagger_bots = {}
skipped_languages = {}
for lang in corpus.hyperdata['languages']:
try:
tagger_bots[lang] = load_tagger(lang)()
except KeyError:
skipped_languages[lang] = True
print("WARNING skipping language:", lang)
# the list of todo docs
docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.hyperdata['skipped_docs']]
# go for the loop
# load only the docs that passed parsing without error
docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.hyperdata["skipped_docs"]]
# load available taggers for the source's default languages
tagger_bots = {lang: load_tagger(lang)() for lang in corpus.hyperdata["languages"] if lang != "__skipped__"}
# sort docs by lang?
# for lang, tagger in tagger_bots.items():
for documents_count, document in enumerate(docs):
language_iso2 = document.hyperdata.get('language_iso2')
#print(language_iso2)
# skip case if no tagger available
if language_iso2 in skipped_languages:
corpus.hyperdata['skipped_docs'][document.id] = True
if language_iso2 in source["default_languages"]:
# filtering out parsing's skipped_docs is probably unnecessary here: they were already filtered out when building docs (?)
#if document.id not in corpus.skipped_docs:
tagger = tagger_bots[language_iso2]
#print(language_iso2)
#>>> romain-stable-patch
# TODO: verify whether the document has no keys to index at all
for key in keys:
try:
value = document.hyperdata[str(key)]
if not isinstance(value, str):
print("DBG wrong content in doc for key", key)
continue
# get ngrams
for ngram in tagger.extract(value):
tokens = tuple(normalize_forms(token[0]) for token in ngram)
if do_subngrams:
# ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'],
# ['very', 'cool', 'exemple'],
# ['cool', 'exemple']]
subterms = subsequences(tokens)
else:
subterms = [tokens]
for seqterm in subterms:
ngram = ' '.join(seqterm)
if len(ngram) > 1:
# doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1
# add fields : terms n
ngrams_data.add((ngram[:255], len(seqterm), ))
except KeyError:
# value not in doc
pass
# except AttributeError:
# print("ERROR NO language_iso2")
# document.status("NGRAMS", error="No lang detected skipped Ngrams")
# corpus.skipped_docs.append(document.id)
# integrate ngrams and nodes-ngrams
if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
nodes_ngrams_count.clear()
ngrams_data.clear()
if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
corpus.status('Ngrams', progress=documents_count+1)
corpus.save_hyperdata()
document.hyperdata["error"] = "Error: unsupported language"
document.save_hyperdata()
session.add(corpus)
session.commit()
continue
# NORMAL CASE
tagger = tagger_bots[language_iso2]
for key in keys:
key = str(key)
if key not in document.hyperdata:
# print("DBG missing key in doc", key)
# TODO test if document has no keys at all
continue
# get a text value
value = document[key]
if not isinstance(value, str):
print("DBG wrong content in doc for key", key)
continue
try:
# get ngrams
ngrams = tagger.extract(value)
for ngram in ngrams:
tokens = tuple(normalize_forms(token[0]) for token in ngram)
if do_subngrams:
# ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'],...]
subterms = subsequences(tokens)
else:
subterms = [tokens]
for seqterm in subterms:
ngram = ' '.join(seqterm)
if len(ngram) > 1:
# doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1
# add fields : terms n
ngrams_data.add((ngram[:255], len(seqterm), ))
except Exception as e:
print('NGRAMS EXTRACTION skipping doc %i because of unknown error:' % document.id, str(e))
# TODO add info to document.hyperdata['error']
pass
# integrate ngrams and nodes-ngrams
if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
nodes_ngrams_count.clear()
ngrams_data.clear()
if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
corpus.status('Ngrams', progress=documents_count+1)
corpus.save_hyperdata()
session.add(corpus)
session.commit()
# integrate ngrams and nodes-ngrams (the remainder)
if len(nodes_ngrams_count) > 0:
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
nodes_ngrams_count.clear()
ngrams_data.clear()
# integrate ngrams and nodes-ngrams (the remainder)
if len(nodes_ngrams_count) > 0:
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
nodes_ngrams_count.clear()
ngrams_data.clear()
corpus.hyperdata['skipped_languages'] = skipped_languages
corpus.save_hyperdata()
corpus.status('Ngrams', progress=documents_count+1, complete=True)
corpus.save_hyperdata()
session.commit()
corpus.status('Ngrams', progress=documents_count+1, complete=True)
corpus.save_hyperdata()
session.commit()
except Exception as error:
corpus.status('Ngrams', error=error)
......
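In the extract_ngrams hunk above, the do_subngrams branch expands every extracted ngram into its contiguous sub-spans via subsequences(). A minimal sketch that reproduces exactly the inline example in the comments (the real gargantext helper may differ, e.g. in whether it also yields single tokens):

def subsequences(tokens):
    # contiguous spans of length 2 up to the full span, e.g.
    # ("very", "cool", "exemple") ->
    # [['very', 'cool'], ['very', 'cool', 'exemple'], ['cool', 'exemple']]
    n = len(tokens)
    return [list(tokens[i:j]) for i in range(n) for j in range(i + 2, n + 1)]

Indexing the sub-spans as well as the full ngram is what later lets shorter terms ("cool exemple") accumulate counts even when they only ever occur inside longer extractions.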
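The same hunk flushes its two buffers whenever the counter reaches BATCH_NGRAMSEXTRACTION_SIZE, trading memory for far fewer database round-trips. A self-contained sketch of that accumulate-then-flush pattern, with a hypothetical write_batch standing in for _integrate_associations(..., db, cursor):

from collections import defaultdict

BATCH_SIZE = 1000  # stand-in for BATCH_NGRAMSEXTRACTION_SIZE

nodes_ngrams_count = defaultdict(int)  # (doc_id, ngram) -> occurrences
ngrams_data = set()                    # (ngram[:255], token_count)

def write_batch(counts, data):
    # hypothetical: the real code INSERTs these via _integrate_associations
    print("flushing %d doc<->ngram pairs, %d distinct ngrams"
          % (len(counts), len(data)))

def index_ngram(doc_id, tokens):
    ngram = ' '.join(tokens)
    nodes_ngrams_count[(doc_id, ngram)] += 1
    ngrams_data.add((ngram[:255], len(tokens)))
    if len(nodes_ngrams_count) >= BATCH_SIZE:
        write_batch(nodes_ngrams_count, ngrams_data)
        nodes_ngrams_count.clear()
        ngrams_data.clear()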
@@ -8,7 +8,7 @@ from traceback import print_tb
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource_by_name, QUERY_SIZE_N_MAX
from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
@@ -16,7 +16,7 @@ from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from moissonneurs.util import Scraper
RESOURCE_TYPE_ISTEX = 8
def query( request ):
@@ -85,7 +85,7 @@ def save(request , project_id):
query = "-"
query_string = "-"
N = QUERY_SIZE_N_MAX
#N = QUERY_SIZE_N_MAX
if "query" in request.POST:
query = request.POST["query"]
@@ -96,10 +96,12 @@ def save(request , project_id):
N = QUERY_SIZE_N_MAX
else:
N = int(request.POST["N"]) # query_size from views_opti
if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR (scrap: istex d/l ): ",msg)
raise ValueError(msg)
N = QUERY_SIZE_N_MAX
#msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
#print("ERROR (scrap: istex d/l ): ",msg)
#raise ValueError(msg)
print("Scrapping Istex: '%s' (%i)" % (query_string , N))
@@ -107,6 +109,7 @@ def save(request , project_id):
pagesize = 50
tasks = Scraper()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
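The chunking above walks through the ISTex API in pages of 50, shrinking the last page so the total stays at N. The same arithmetic as a standalone sketch, assuming N = 120 and a dummy query:

N, pagesize = 120, 50
query_string = "my+query"
urlreqs = []
for start in range(0, N, pagesize):
    size = min(pagesize, N - start)  # the last page may be partial
    urlreqs.append(
        "http://api.istex.fr/document/?q=%s&output=*&from=%i&size=%i"
        % (query_string, start, size))
# -> from=0&size=50, from=50&size=50, from=100&size=20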
@@ -131,6 +134,7 @@ def save(request , project_id):
t = threading.Thread(target=tasks.worker2) # the job each worker thread runs
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
tasks.q.put( url ) # put a task in the queue
tasks.q.join() # wait until everything is finished
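The download fan-out is the standard queue.Queue plus daemon-thread pattern; worker2 is not shown in this hunk but presumably loops on q.get()/q.task_done(). A self-contained sketch of the pattern with a placeholder fetch step:

import threading
import queue

q = queue.Queue()

def worker():
    while True:
        url = q.get()      # block until a URL is queued
        try:
            pass           # placeholder: download url here
        finally:
            q.task_done()  # always mark the task done, even on error

for _ in range(4):         # a few parallel downloaders
    t = threading.Thread(target=worker)
    t.daemon = True        # dies when the main thread exits
    t.start()

for url in ("http://api.istex.fr/document/?q=a",
            "http://api.istex.fr/document/?q=b"):
    q.put(url)
q.join()                   # wait until every queued URL is processed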
@@ -140,21 +144,21 @@ def save(request , project_id):
if filename!=False:
# add the uploaded resource to the corpus
corpus.add_resource(
type = get_resource_by_name('ISTex')["type"]
type = get_resource(RESOURCE_TYPE_ISTEX)["type"]
, path = filename
)
dwnldsOK+=1
session.add(corpus)
session.commit()
corpus_id = corpus.id
#corpus_id = corpus.id
if dwnldsOK == 0 :
return JsonHttpResponse(["fail"])
###########################
###########################
try:
scheduled(parse_extract_indexhyperdata)(corpus_id)
scheduled(parse_extract_indexhyperdata)(corpus.id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
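Dropping the commented-out corpus_id variable in favour of corpus.id works because the session.commit() a few lines earlier flushed the corpus row, so its primary key is already assigned. A minimal SQLAlchemy sketch of that guarantee, with a hypothetical stand-in model:

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Node(Base):  # hypothetical stand-in for gargantext's Node
    __tablename__ = "nodes"
    id = Column(Integer, primary_key=True)
    name = Column(String)

engine = create_engine("sqlite://")  # in-memory db, sketch only
Base.metadata.create_all(engine)

with Session(engine) as session:
    corpus = Node(name="my corpus")
    session.add(corpus)
    session.commit()  # the flush assigns the primary key
    print(corpus.id)  # safe to hand to the scheduler now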
@@ -178,4 +182,5 @@ def save(request , project_id):
data = [query_string,query,N]
print(data)
return JsonHttpResponse(data)