Commit bc8107f6 authored by c24b's avatar c24b

[PATCH] resource_loop + BATCH NGRAMS EXTRACTION + Lang

parent ff681f29
......@@ -27,7 +27,7 @@ class ISTexParser(Parser):
}
suma = 0
print(len(json_docs))
for json_doc in json_docs:
hyperdata = {}
......@@ -92,9 +92,9 @@ class ISTexParser(Parser):
hyperdata["language_iso3"] = "eng"
# (cf. api.istex.fr/document/?q=*&facet=language
# et tests langid sur les language=["unknown"])
#just to be sure
hyperdata = self.format_hyperdata_languages(hyperdata)
if "publication_date" in hyperdata:
RealDate = hyperdata["publication_date"]
if "publication_date" in hyperdata:
......
......@@ -62,7 +62,8 @@ def parse_extract_indexhyperdata(corpus):
# apply actions
print('CORPUS #%d' % (corpus.id))
parse(corpus)
print('CORPUS #%d: parsed' % (corpus.id))
docs = corpus.children("DOCUMENT").count()
print('CORPUS #%d: parsed %d' % (corpus.id, docs))
extract_ngrams(corpus)
# Preparing Database
......
......@@ -6,38 +6,112 @@ from collections import defaultdict, Counter
from re import sub
from gargantext.util.languages import languages, detect_lang
def _count_language(languages, hyperdata, skipped_languages, lang, error_template):
    """Count one document for `lang`, or flag the document as unsupported.

    `languages[lang]` is incremented in place. When `lang` is not a key of
    `languages` (KeyError), the document is marked with an "error" entry
    built from `error_template % lang` and `lang` is appended to
    `skipped_languages`.

    Returns the (languages, hyperdata, skipped_languages) tuple.
    """
    try:
        languages[lang] += 1
    except KeyError:
        hyperdata["error"] = error_template % lang
        skipped_languages.append(lang)
    return languages, hyperdata, skipped_languages


def add_lang(languages, hyperdata, skipped_languages):
    """Record the language of one document in the per-language counters.

    The language is looked up, in this order:
      1. hyperdata["language_iso2"], used as is;
      2. hyperdata["language_iso3"], then hyperdata["language_fullname"],
         converted to an iso2 code through the language registry;
      3. detection on the text of the DEFAULT_INDEX_FIELDS values.

    Parameters:
      languages: mapping {iso2 code: document count}; a lookup on an
        unknown code must raise KeyError (this is how unsupported
        languages are recognised).
      hyperdata: dict-like record of the document; an "error" entry is
        added when the language is unsupported or no text is available.
      skipped_languages: list collecting the language codes (or
        "__unknown__") of the documents that could not be counted.

    Returns:
      The tuple (languages, hyperdata, skipped_languages); the three
      objects are the ones passed in, modified in place.
    """
    unsupported = "Error: unsupported language %s"

    if "language_iso2" in hyperdata:
        return _count_language(
            languages, hyperdata, skipped_languages,
            hyperdata["language_iso2"], unsupported,
        )

    # Converting iso3 / full names should be the responsibility of the
    # parserbot; it is done here as a fallback.
    conversions = (
        ("language_iso3", "LANG not referenced"),
        ("language_fullname", "LANG Not referenced"),
    )
    for field, not_found_label in conversions:
        if field not in hyperdata:
            continue
        # The `languages` parameter (the counters) shadows the registry
        # imported at module level under the same name, so the registry is
        # re-imported here under another name.
        # NOTE(review): assumes the registry raises KeyError for an unknown
        # code and yields objects exposing `.iso2` -- confirm against
        # gargantext.util.languages.
        from gargantext.util.languages import languages as language_registry
        try:
            lang = language_registry[hyperdata[field]].iso2
        except KeyError:
            # Unknown to the registry: only reported, the document is
            # neither counted nor flagged as an error.
            print(not_found_label, hyperdata[field])
            return languages, hyperdata, skipped_languages
        return _count_language(
            languages, hyperdata, skipped_languages, lang, unsupported,
        )

    print("[WARNING] no language_* found in document [parsing.py]")
    # No language field at all: detect the language from the indexed text
    # fields. hyperdata is accessed by key (it is a mapping, not an object
    # with attributes); missing or empty fields are ignored.
    text = " ".join(
        str(hyperdata[k]) for k in DEFAULT_INDEX_FIELDS if hyperdata.get(k)
    )
    if len(text) < 10:
        hyperdata["error"] = "Error: no TEXT fields to index"
        skipped_languages.append("__unknown__")
        return languages, hyperdata, skipped_languages
    # detect_lang is expected to return an iso2 code
    return _count_language(
        languages, hyperdata, skipped_languages,
        detect_lang(text), "Error: unsupported language '%s'",
    )
def parse(corpus):
try:
documents_count = 0
print("PARSING")
corpus.status('Docs', progress=0)
#1 corpus => 1 resource
#1 corpus => 1 or multi resources.path (for crawlers)
resources = corpus.resources()
#get the sources capabilities for a given corpus resource
sources = [get_resource(resource["type"]) for resource in corpus.resources() if resource["extracted"] is False]
if len(sources) == 0:
#>>> documents have already been parsed?????
if len(resources) == 0:
return
if len(sources) > 0:
#>>> necessairement 1 corpus = 1 source dans l'archi actuelle
source = sources[0]
resource = resources[0]
#source.extend(resource)
if source["parser"] is None:
#corpus.status(error)
raise ValueError("Resource '%s' has no Parser" %resource["name"])
#all the resources are of the same type for now
source = get_resource(resources[0]["type"])
#get the sources capabilities for a given corpus resource
#load the corresponding parserbot
if source["parser"] is None:
#corpus.status(error)
raise ValueError("Resource '%s' has no Parser" %resource["name"])
parserbot = load_parser(source)
#observed languages in default languages
languages = defaultdict.fromkeys(source["default_languages"], 0)
#skipped_languages
skipped_languages = []
#skipped docs to remember for later processing
skipped_docs = []
#BY RESOURCE
for i,resource in enumerate(resources):
if resource["extracted"] is True:
continue
else:
#observed langages in corpus docs
corpus.languages = defaultdict.fromkeys(source["default_languages"], 0)
#remember the skipped docs in parsing
skipped_languages = []
corpus.skipped_docs = []
session.add(corpus)
session.commit()
#load the corresponding parser
parserbot = load_parser(source)
# extract and insert documents from resource.path into database
default_lang_field = ["language_"+l for l in ["iso2", "iso3", "full_name"]]
# BY documents
d = 0
for hyperdata in parserbot(resource["path"]):
# indexed text fields defined in CONSTANTS
for k in DEFAULT_INDEX_FIELDS:
......@@ -46,82 +120,70 @@ def parse(corpus):
hyperdata[k] = normalize_chars(hyperdata[k])
except Exception as error :
hyperdata["error"] = "Error normalize_chars"
#any parser should implement a language_iso2
if "language_iso2" in hyperdata.keys():
try:
corpus.languages[hyperdata["language_iso2"]] +=1
except KeyError:
hyperdata["error"] = "Error: unsupported language"
skipped_languages.append(hyperdata["language_iso2"])
# this should be the responsibility of the parserbot
# elif "language_iso3" in hyperdata.keys():
# try:
# corpus.languages[languages(hyperdata["language_iso2"]).iso2] +=1
# except KeyError:
# hyperdata["error"] = "Error: unsupported language"
# skipped_languages.append(hyperdata["language_iso2"])
else:
print("[WARNING] no language_iso2 found in document [parsing.py]")
#no language have been indexed
#detectlang by index_fields
text = " ".join([getattr(hyperdata, k) for k in DEFAULT_INDEX_FIELDS])
if len(text) < 10:
hyperdata["error"] = "Error: no TEXT fields to index"
skipped_languages.append("__unknown__")
hyperdata["language_iso2"] = detect_lang(text)
try:
corpus.languages[hyperdata["language_iso2"]] += 1
corpus.languages[hyperdata["language_iso2"]] +=1
except KeyError:
hyperdata["error"] = "Error: unsupported language"
skipped_languages.append(hyperdata["language_iso2"])
#else:
#print("[WARNING] No %s field found in hyperdata at parsing.py" %k)
# continue
#adding lang into record hyperdata
languages, hyperdata, skipped_languages = add_lang(languages, hyperdata, skipped_languages)
# save as DB child
# ----------------
d += 1
#print ("INSERT", d)
document = corpus.add_child(
typename = 'DOCUMENT',
name = hyperdata.get('title', '')[:255],
hyperdata = hyperdata,
)
#corpus.save_hyperdata()
session.add(document)
session.commit()
if "error" in hyperdata.keys():
#document.status("error")
document.status('Parsing', error= document.hyperdata["error"])
document.save_hyperdata()
session.add(document)
session.commit()
#adding skipped_docs for later processing
corpus.skipped_docs.append(document.id)
documents_count += 1
# logging
if documents_count % BATCH_PARSING_SIZE == 0:
corpus.status('Docs', progress=documents_count)
corpus.save_hyperdata()
session.add(corpus)
session.commit()
skipped_docs.append(document.id)
#documents for this resources
session.add(corpus)
session.commit()
# update info about the resource
resource['extracted'] = True
#print( "resource n°",i, ":", d, "docs inside this file")
# update info about the resource
resource['extracted'] = True
# add a corpus-level info about languages adding a __skipped__ info
corpus.languages['__skipped__'] = Counter(skipped_languages)
print("LANGUES")
for n in corpus.languages.items():
print(n)
#TO DO: give the main language of the corpus to unsupported lang docs
print(len(corpus.skipped_docs), "docs skipped")
# commit all changes
corpus.status('Docs', progress=documents_count, complete=True)
print(len(skipped_docs), "docs skipped")
print(corpus.children("DOCUMENT").count(), "docs parsed")
#main language of the corpus
print(languages.items())
corpus.language_id = sorted(languages.items(), key = lambda x: x[1], reverse=True)[0][0]
print(corpus.language_id)
languages['__skipped__'] = dict(Counter(skipped_languages))
corpus.languages = languages
corpus.skipped_docs = list(set(skipped_docs))
corpus.save_hyperdata()
session.add(corpus)
session.commit()
if len(corpus.skipped_docs) > 0:
print (sum(languages["__skipped__"].values()), "docs with unsupported lang")
#assign main lang to unsupported languages docs
for d_id in corpus.skipped_docs:
document = session.query(Node).filter(Node.id == d_id, Node.typename == "DOCUMENT").first()
document.hyperdata["language_iso2"] = corpus.language_id
document.save_hyperdata()
session.commit()
except Exception as error:
corpus.status('Docs', error=error)
corpus.save_hyperdata()
......
......@@ -8,7 +8,7 @@ from traceback import print_tb
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource_by_name, QUERY_SIZE_N_MAX
from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
......@@ -16,7 +16,7 @@ from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from moissonneurs.util import Scraper
RESOURCE_TYPE_ISTEX = 8
def query( request ):
......@@ -85,7 +85,7 @@ def save(request , project_id):
query = "-"
query_string = "-"
N = QUERY_SIZE_N_MAX
#N = QUERY_SIZE_N_MAX
if "query" in request.POST:
query = request.POST["query"]
......@@ -96,10 +96,12 @@ def save(request , project_id):
N = QUERY_SIZE_N_MAX
else:
N = int(request.POST["N"]) # query_size from views_opti
if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR (scrap: istex d/l ): ",msg)
raise ValueError(msg)
N = QUERY_SIZE_N_MAX
#msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
#print("ERROR (scrap: istex d/l ): ",msg)
#raise ValueError(msg)
print("Scrapping Istex: '%s' (%i)" % (query_string , N))
......@@ -107,6 +109,7 @@ def save(request , project_id):
pagesize = 50
tasks = Scraper()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
......@@ -131,6 +134,7 @@ def save(request , project_id):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
tasks.q.put( url ) #put a task in th queue
tasks.q.join() # wait until everything is finished
......@@ -140,21 +144,21 @@ def save(request , project_id):
if filename!=False:
# add the uploaded resource to the corpus
corpus.add_resource(
type = get_resource_by_name('ISTex')["type"]
type = get_resource(RESOURCE_TYPE_ISTEX)["type"]
, path = filename
)
dwnldsOK+=1
session.add(corpus)
session.commit()
corpus_id = corpus.id
#corpus_id = corpus.id
if dwnldsOK == 0 :
return JsonHttpResponse(["fail"])
###########################
###########################
try:
scheduled(parse_extract_indexhyperdata)(corpus_id)
scheduled(parse_extract_indexhyperdata)(corpus.id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
......@@ -178,4 +182,5 @@ def save(request , project_id):
data = [query_string,query,N]
print(data)
return JsonHttpResponse(data)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment