Commit 49552ff6 authored by c24b

PARSING: add default_languages and skipped_docs handling

parent cd453144
......@@ -40,26 +40,31 @@ class CernParser(Parser):
"856": {"u":"pdf_source"},
}
def format_date(self, hyperdata):
'''formatting pubdate'''
prefix = "publication"
date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
#hyperdata[prefix + "_year"] = date.strftime('%Y')
hyperdata[prefix + "_month"] = date.strftime("%m")
hyperdata[prefix + "_day"] = date.strftime("%d")
hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S")
hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
print("Date", hyperdata["publication_date"])
return hyperdata
# def format_date(self, hyperdata):
# '''formatting pubdate'''
# prefix = "publication"
# try:
# date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
# except ValueError:
# date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m")
# date.day = "01"
# hyperdata[prefix + "_year"] = date.strftime('%Y')
# hyperdata[prefix + "_month"] = date.strftime("%m")
# hyperdata[prefix + "_day"] = date.strftime("%d")
#
# hyperdata[prefix + "_hour"] = date.strftime("%H")
# hyperdata[prefix + "_minute"] = date.strftime("%M")
# hyperdata[prefix + "_second"] = date.strftime("%S")
# hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
# #print("Date", hyperdata["publication_date"])
# return hyperdata
#@asyncio.coroutine
def parse(self, file):
print("PARSING")
#print("PARSING")
hyperdata_list = []
doc = file.read()
print(doc[:35])
#print(doc[:35])
soup = BeautifulSoup(doc, "lxml")
#print(soup.find("record"))
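The commented-out variant of format_date above adds a fallback for records whose date only carries a year and a month. A minimal, hypothetical standalone sketch of that idea (the committed code instead delegates to format_hyperdata_dates) could look like this:

```python
from datetime import datetime

def format_publication_date(hyperdata, prefix="publication"):
    """Hypothetical sketch: expand hyperdata[prefix + '_date'] into components.

    strptime() with '%Y-%m' already defaults the day to 1, so no manual
    date.day assignment is needed (datetime attributes are read-only anyway).
    """
    raw = hyperdata[prefix + "_date"]
    try:
        date = datetime.strptime(raw, "%Y-%m-%d")
    except ValueError:
        date = datetime.strptime(raw, "%Y-%m")
    hyperdata[prefix + "_year"] = date.strftime("%Y")
    hyperdata[prefix + "_month"] = date.strftime("%m")
    hyperdata[prefix + "_day"] = date.strftime("%d")
    hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
    return hyperdata
```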
......@@ -93,8 +98,8 @@ class CernParser(Parser):
hyperdata["authors_affiliations"] = (",").join(hyperdata["authors_affiliations"])
hyperdata["authors"] = (",").join(hyperdata["authors"])
hyperdata["authors_mails"] = (",").join(hyperdata["authors_mails"])
hyperdata = self.format_date(hyperdata)
#hyperdata = self.format_date(hyperdata)
hyperdata = self.format_hyperdata_languages(hyperdata)
hyperdata = self.format_hyperdata_dates(hyperdata)
hyperdata_list.append(hyperdata)
return hyperdata_list
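After the joins above, parse() returns one hyperdata dict per record; a single entry looks roughly like this (all values are invented for illustration):

```python
# illustrative shape of one entry in the hyperdata_list returned by CernParser.parse()
hyperdata = {
    "title": "Example CERN record",
    "authors": "A. Author,B. Author",                        # joined with ","
    "authors_affiliations": "CERN,Some University",          # joined with ","
    "authors_mails": "a.author@cern.ch,b.author@example.org",
    "language_iso2": "en",                                   # set by format_hyperdata_languages
    "publication_date": "2016-05-01 00:00:00",               # set by format_hyperdata_dates
    "pdf_source": "http://example.org/record.pdf",           # MARC field 856 $u
}
```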
......@@ -9,7 +9,7 @@ import nltk
class Tagger:
def __init__(self, text):
def __init__(self):
# This regular expression is really good at tokenizing a text!
self._re_sentence = re.compile(r'''(?x) # set flag to allow verbose regexps
(?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
......@@ -19,18 +19,18 @@ class Tagger:
| [][.,;"'?!():-_`] # these are separate tokens
''', re.UNICODE | re.MULTILINE | re.DOTALL)
self.buffer = []
self.text = clean_text(text)
self.start()
#self.start()
def clean_text(text):
def clean_text(self, text):
"""Clean the text for better POS tagging.
For now, only removes (short) XML tags.
"""
return re.sub(r'<[^>]{0,45}>', '', text)
def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
text = self.clean_text(text)
self.text = self.clean_text(text)
grammar = nltk.RegexpParser(label + ': ' + rule)
tagged_tokens = list(self.tag_text(self.text))
if len(tagged_tokens):
......
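With this refactor a Tagger is built without any text and receives it on each extract() call, which now cleans the text itself. A rough usage sketch (assuming a concrete tagger subclass, e.g. the class returned by load_tagger(lang), stands in for Tagger):

```python
# hypothetical usage of the reworked Tagger API
tagger = Tagger()                              # no text at construction time any more
for doc_text in ("<b>Neural networks</b> for corpus exploration",
                 "Graph based text mining"):
    # extract() cleans the text, stores it on the instance and chunks it
    for ngram in tagger.extract(doc_text):
        print(ngram)                           # sequence of (token, POS tag) pairs
```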
......@@ -82,6 +82,7 @@ def parse_extract_indexhyperdata(corpus):
favs = corpus.add_child(
typename='FAVORITES', name='favorite docs in "%s"' % corpus.name
)
session.add(favs)
session.commit()
print('CORPUS #%d: [%s] new favorites node #%i' % (corpus.id, t(), favs.id))
......@@ -265,7 +266,7 @@ def recount(corpus):
# -> specclusion/genclusion: compute + write (=> NodeNodeNgram)
(spec_id, gen_id) = compute_specgen(corpus, cooc_matrix = coocs,
spec_overwrite_id = old_spec_id,
spec_overwrite_id = old_spec_id,
gen_overwrite_id = old_gen_id)
print('RECOUNT #%d: [%s] updated spec-clusion node #%i' % (corpus.id, t(), spec_id))
......
......@@ -22,17 +22,13 @@ def prepare_stemmers(corpus):
"""
Returns *several* stemmers (one for each language in the corpus)
(as a dict of stemmers with key = language_iso2)
languages have been previously filtered against the supported source languages
and formatted
"""
stemmers_by_lg = {
# always get a generic stemmer in case language code unknown
'__unknown__' : SnowballStemmer("english")
}
for lang in corpus.languages.keys():
print(lang)
if (lang != '__skipped__'):
lgname = languages[lang].name.lower()
stemmers_by_lg[lang] = SnowballStemmer(lgname)
return stemmers_by_lg
stemmers = {lang:SnowballStemmer(languages[lang].name.lower()) for lang \
in corpus.languages.keys() if lang !="__skipped__"}
stemmers['__unknown__'] = SnowballStemmer("english")
return stemmers
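The comprehension above yields one SnowballStemmer per corpus language plus a generic English fallback. Downstream code can then pick a stemmer per document language along these lines (language codes are illustrative):

```python
from nltk.stem.snowball import SnowballStemmer

# illustrative result of prepare_stemmers() for a corpus with 'en' and 'fr' documents
stemmers_by_lg = {
    'en': SnowballStemmer('english'),
    'fr': SnowballStemmer('french'),
    '__unknown__': SnowballStemmer('english'),   # fallback for unknown language codes
}

lgid = 'de'                                      # a code absent from the corpus
stemmer = stemmers_by_lg.get(lgid, stemmers_by_lg['__unknown__'])
print(stemmer.stem('languages'))                 # the English stemmer strips the suffix
```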
def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
"""
......@@ -40,7 +36,6 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)
"""
print(corpus.languages.keys())
stop_ngrams_ids = {}
# we will need the ngrams of the stoplist to filter
......@@ -60,16 +55,17 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
my_groups = defaultdict(Counter)
# preloop per doc to sort ngrams by language
for doc in corpus.children():
if ('language_iso2' in doc.hyperdata):
lgid = doc.hyperdata['language_iso2']
else:
lgid = "__unknown__"
# doc.ngrams is an sql query (ugly but useful intermediate step)
# FIXME: move the counting and stoplist filtering up here
for ngram_pack in doc.ngrams.all():
todo_ngrams_per_lg[lgid].add(ngram_pack)
for doc in corpus.children('DOCUMENT'):
if doc.id not in corpus.skipped_docs:
if ('language_iso2' in doc.hyperdata):
lgid = doc.hyperdata['language_iso2']
else:
lgid = "__unknown__"
# doc.ngrams is an sql query (ugly but useful intermediate step)
# FIXME: move the counting and stoplist filtering up here
for ngram_pack in doc.ngrams.all():
todo_ngrams_per_lg[lgid].add(ngram_pack)
# --------------------
# long loop per ngrams
......
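The reworked preloop only keeps DOCUMENT children that were not flagged during parsing and buckets their ngrams per language. In isolation, that filtering pattern is simply (a sketch reusing the names from the hunk above):

```python
from collections import defaultdict

def bucket_ngrams_by_language(corpus):
    """Sketch of the preloop above: group each document's ngrams per language,
    skipping documents whose ids were recorded in corpus.skipped_docs at parse time."""
    todo_ngrams_per_lg = defaultdict(set)
    skipped = set(corpus.skipped_docs)
    for doc in corpus.children('DOCUMENT'):
        if doc.id in skipped:
            continue
        lgid = doc.hyperdata.get('language_iso2', '__unknown__')
        for ngram_pack in doc.ngrams.all():
            todo_ngrams_per_lg[lgid].add(ngram_pack)
    return todo_ngrams_per_lg
```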
......@@ -3,7 +3,6 @@ from gargantext.models import *
from gargantext.constants import *
from collections import defaultdict
from re import sub
from gargantext.util.scheduling import scheduled
def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
......@@ -44,51 +43,50 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
db, cursor = get_cursor()
nodes_ngrams_count = defaultdict(int)
ngrams_data = set()
# extract ngrams
#1 corpus = 1 resource
resource = corpus.resources()[0]
source = get_resource(resource["type"])
documents_count = 0
source = get_resource(resource["type"])
#load available taggers for the source's default languages
#skip documents previously flagged during parsing for an error or unsupported language
tagger_bots = {lang: load_tagger(lang) for lang in corpus.languages if lang != "__skipped__"}
docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
tagger_bots = {lang: load_tagger(lang)() for lang in corpus.languages if lang != "__skipped__"}
#sort docs by lang?
for documents_count, document in enumerate(docs):
try:
lang_doc = document.hyperdata["language_iso2"]
except AttributeError:
print("NO LANG DETECTED")
document.status("NGRAMS", error="No lang detected?")
corpus.skipped_docs.append(document.id)
continue
for key in keys:
value = document.get(key, None)
print("VAL", value)
if not isinstance(value, str):
continue
# get ngrams
for ngram in tagger_bots[lang_doc](value):
tokens = tuple(normalize_forms(token[0]) for token in ngram)
print("tk", tokens)
if do_subngrams:
# ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'],
# ['very', 'cool', 'exemple'],
# ['cool', 'exemple']]
subterms = subsequences(tokens)
else:
subterms = [tokens]
for seqterm in subterms:
ngram = ' '.join(seqterm)
if len(ngram) > 1:
# doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1
# add fields : terms n
ngrams_data.add((ngram[:255], len(seqterm), ))
for lang, tagger in tagger_bots.items():
for documents_count, document in enumerate(docs):
language_iso2 = document.hyperdata.get('language_iso2', lang)
#print(language_iso2)
for key in keys:
try:
value = document[str(key)]
if not isinstance(value, str):
continue
# get ngrams
for ngram in tagger.extract(value):
tokens = tuple(normalize_forms(token[0]) for token in ngram)
if do_subngrams:
# ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'],
# ['very', 'cool', 'exemple'],
# ['cool', 'exemple']]
subterms = subsequences(tokens)
else:
subterms = [tokens]
for seqterm in subterms:
ngram = ' '.join(seqterm)
if len(ngram) > 1:
# doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1
# add fields : terms n
ngrams_data.add((ngram[:255], len(seqterm), ))
except:
#value not in doc
pass
# except AttributeError:
# print("ERROR NO language_iso2")
# document.status("NGRAMS", error="No lang detected skipped Ngrams")
# corpus.skipped_docs.append(document.id)
# integrate ngrams and nodes-ngrams
if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
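The inline comment above spells out what subsequences() is expected to return for ["very", "cool", "exemple"]. One hypothetical implementation producing all contiguous sub-windows would be (the actual gargantext helper may differ, e.g. in whether single-token windows are emitted):

```python
def subsequences(tokens):
    """Hypothetical sketch of a subsequences() helper: return every contiguous
    sub-window of a token tuple, e.g. ("very", "cool", "exemple") ->
    [("very",), ("very", "cool"), ("very", "cool", "exemple"),
     ("cool",), ("cool", "exemple"), ("exemple",)].
    The inline comment above only lists the multi-word windows."""
    n = len(tokens)
    return [tokens[i:j] for i in range(n) for j in range(i + 1, n + 1)]
```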
......@@ -97,6 +95,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
corpus.status('Ngrams', progress=documents_count+1)
corpus.save_hyperdata()
session.add(corpus)
session.commit()
else:
# integrate ngrams and nodes-ngrams
......
from gargantext.util.db import *
from gargantext.models import *
from gargantext.constants import *
from gargantext.util.parsers import *
#from gargantext.util.parsers import *
from collections import defaultdict, Counter
from re import sub
......@@ -11,13 +11,11 @@ def parse(corpus):
corpus.status('Docs', progress=0)
#1 corpus => 1 resource
resources = corpus.resources()
#get the sources capabilities for a given corpus
#print(resource)
#get the source capabilities for a given corpus resource
sources = [get_resource(resource["type"]) for resource in corpus.resources() if resource["extracted"] is False]
print(sources)
if len(sources) == 0:
#>>> documents have already been parsed?????
pass
return
if len(sources) > 0:
#>>> necessarily 1 corpus = 1 source in the current architecture
source = sources[0]
......@@ -27,25 +25,24 @@ def parse(corpus):
#corpus.status(error)
raise ValueError("Resource '%s' has no Parser" %resource["name"])
else:
#observed languages in corpus docs
corpus.languages = defaultdict.fromkeys(source["default_languages"], 0)
skipped_languages = []
#remember the docs skipped during parsing
corpus.skipped_docs = []
session.add(corpus)
session.commit()
#load the corresponding parser
parserbot = load_parser(source)
skipped_languages = []
# extract and insert documents from resource.path into database
#print(resource["path"])
for hyperdata in parserbot(resource["path"]):
# indexed text fields defined in constants
# indexed text fields defined in CONSTANTS
for k in DEFAULT_INDEX_FIELDS:
if k in hyperdata.keys():
try:
hyperdata[k] = normalize_chars(hyperdata[k])
except Exception as error :
hyperdata["error"] = "Error normalize_chars"
# a simple census to raise language info at corpus level
if "language_iso2" in hyperdata.keys():
try:
......@@ -54,33 +51,43 @@ def parse(corpus):
print("KeyError", hyperdata["language_iso2"])
hyperdata["error"] = "Error: unsupported language"
skipped_languages.append(hyperdata["language_iso2"])
elif "language_iso3" in hyperdata.keys():
elif "language_fullname" in hyperdata.keys():
try:
lang = language[hyperdata["language_iso3"]]
#full => iso2
lang = languages[hyperdata["language_fullname"]].name.lower()
corpus.languages[lang] += 1
except KeyError:
print("KeyError", lang)
print("KeyError", hyperdata["language_fullname"])
hyperdata["error"] = "Error: unsupported language"
skipped_languages.append(hyperdata["language_iso2"])
skipped_languages.append(lang)
else:
raise ValueError("PARSING ERROR: No lang detected")
# save as DB child
# ----------------
document = corpus.add_child(
typename = 'DOCUMENT',
name = hyperdata.get('title', '')[:255],
hyperdata = hyperdata,
)
session.add(document)
if "error" in hyperdata.keys():
#document.status("error")
print(hyperdata["error"])
document.status('Parsing', error= document.hyperdata["error"])
document.save_hyperdata()
session.commit()
corpus.skipped_docs.append(document.id)
pass
#no language has been indexed
#detect the language from the index_fields
# for k in DEFAULT_INDEX_FIELDS:
# if k in hyperdata.keys():
# try:
# hyperdata["language_iso2"] = langdetect(hyperdata[k])
# except Exception as error :
# pass
#print(hyperdata.keys())
# save as DB child
# ----------------
document = corpus.add_child(
typename = 'DOCUMENT',
name = hyperdata.get('title', '')[:255],
hyperdata = hyperdata,
)
session.add(document)
if "error" in hyperdata.keys():
#document.status("error")
document.status('Parsing', error= document.hyperdata["error"])
document.save_hyperdata()
session.commit()
#adding skipped_docs for later processing
corpus.skipped_docs.append(document.id)
documents_count += 1
# logging
if documents_count % BATCH_PARSING_SIZE == 0:
......@@ -92,11 +99,13 @@ def parse(corpus):
documents_count += 1
# update info about the resource
resource['extracted'] = True
# add corpus-level info about languages, including a __skipped__ entry
corpus.languages['__skipped__'] = Counter(skipped_languages)
for n in corpus.languages.items():
print(n)
# commit all changes
corpus.status('Docs', progress=documents_count, complete=True)
corpus.save_hyperdata()
......
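After parsing, corpus.languages carries both the per-language document counts and a __skipped__ Counter of unsupported languages. A toy illustration of that bookkeeping (language codes and counts are invented):

```python
from collections import Counter

languages = dict.fromkeys(["en", "fr"], 0)       # seeded from source["default_languages"]
skipped_languages = []

for language_iso2 in ["en", "en", "fr", "xx"]:   # one entry per parsed document
    try:
        languages[language_iso2] += 1
    except KeyError:
        skipped_languages.append(language_iso2)  # unsupported -> document gets skipped

languages["__skipped__"] = Counter(skipped_languages)
print(languages)   # {'en': 2, 'fr': 1, '__skipped__': Counter({'xx': 1})}
```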
......@@ -37,7 +37,7 @@ def docs_by_titles(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus': corpus,
'resourcename' : resourcename(corpus),
'resourcename' : get_resource_by_name(corpus.resources()[0]),
'view': 'titles',
'user': request.user
},
......@@ -65,7 +65,7 @@ def docs_by_journals(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus' : corpus,
'resourcename' : resourcename(corpus),
'resourcename' : get_resource_by_name(corpus.resources()[0]),
'view': 'journals'
},
)
......@@ -84,11 +84,8 @@ def analytics(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus': corpus,
'resourcename' : resourcename(corpus),
'resourcename' : get_resource_by_name(corpus.resources()[0]),
'view': 'analytics',
'user': request.user
},
)