Commit 49552ff6 authored by c24b

PARSING with default_languages and skipped_docs added

parent cd453144
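In short, this commit threads two new corpus-level fields through the toolchain: `corpus.languages`, seeded from the source's `default_languages` and incremented per parsed document, and `corpus.skipped_docs`, the ids of documents dropped for parsing errors or unsupported languages. A rough sketch of the consumption pattern the later hunks rely on (illustrative only, assuming `corpus` exposes these attributes as set during parsing):

```python
# Hedged sketch: how later toolchain steps are expected to use the fields set at parse time.
def usable_documents(corpus):
    """Documents to process, minus those flagged during parsing."""
    return [doc for doc in corpus.children('DOCUMENT')
            if doc.id not in corpus.skipped_docs]

def observed_languages(corpus):
    """Language codes seen in the corpus, without the '__skipped__' bookkeeping key."""
    return [lang for lang in corpus.languages if lang != "__skipped__"]
```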
@@ -40,26 +40,31 @@ class CernParser(Parser):
         "856": {"u":"pdf_source"},
     }

-    def format_date(self, hyperdata):
-        '''formatting pubdate'''
-        prefix = "publication"
-        date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
-        #hyperdata[prefix + "_year"] = date.strftime('%Y')
-        hyperdata[prefix + "_month"] = date.strftime("%m")
-        hyperdata[prefix + "_day"] = date.strftime("%d")
-        hyperdata[prefix + "_hour"] = date.strftime("%H")
-        hyperdata[prefix + "_minute"] = date.strftime("%M")
-        hyperdata[prefix + "_second"] = date.strftime("%S")
-        hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
-        print("Date", hyperdata["publication_date"])
-        return hyperdata
+    # def format_date(self, hyperdata):
+    #     '''formatting pubdate'''
+    #     prefix = "publication"
+    #     try:
+    #         date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
+    #     except ValueError:
+    #         date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m")
+    #         date.day = "01"
+    #     hyperdata[prefix + "_year"] = date.strftime('%Y')
+    #     hyperdata[prefix + "_month"] = date.strftime("%m")
+    #     hyperdata[prefix + "_day"] = date.strftime("%d")
+    #
+    #     hyperdata[prefix + "_hour"] = date.strftime("%H")
+    #     hyperdata[prefix + "_minute"] = date.strftime("%M")
+    #     hyperdata[prefix + "_second"] = date.strftime("%S")
+    #     hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
+    #     #print("Date", hyperdata["publication_date"])
+    #     return hyperdata

     #@asyncio.coroutine
     def parse(self, file):
-        print("PARSING")
+        #print("PARSING")
         hyperdata_list = []
         doc = file.read()
-        print(doc[:35])
+        #print(doc[:35])
         soup = BeautifulSoup(doc, "lxml")
         #print(soup.find("record"))

@@ -93,8 +98,8 @@ class CernParser(Parser):
             hyperdata["authors_affiliations"] = (",").join(hyperdata["authors_affiliations"])
             hyperdata["authors"] = (",").join(hyperdata["authors"])
             hyperdata["authors_mails"] = (",").join(hyperdata["authors_mails"])
-            hyperdata = self.format_date(hyperdata)
+            #hyperdata = self.format_date(hyperdata)
+            hyperdata = self.format_hyperdata_languages(hyperdata)
+            hyperdata = self.format_hyperdata_dates(hyperdata)
             hyperdata_list.append(hyperdata)
         return hyperdata_list
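Date handling moves from the parser's local `format_date` to the base class's `format_hyperdata_dates` (and language handling to `format_hyperdata_languages`). The commented-out version documents the intent: parse `%Y-%m-%d` and fall back to `%Y-%m`. A minimal standalone sketch of that fallback (illustrative, not the base-class code; note that `datetime` objects are immutable, so the commented `date.day = "01"` would not have worked as written):

```python
from datetime import datetime

def parse_pubdate(raw):
    """Parse 'YYYY-MM-DD', falling back to 'YYYY-MM' (the day then defaults to 01)."""
    for fmt in ("%Y-%m-%d", "%Y-%m"):
        try:
            return datetime.strptime(raw, fmt)
        except ValueError:
            continue
    raise ValueError("unsupported publication_date format: %r" % raw)

print(parse_pubdate("2016-05-12"))  # 2016-05-12 00:00:00
print(parse_pubdate("2016-05"))     # 2016-05-01 00:00:00
```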
@@ -9,7 +9,7 @@ import nltk

 class Tagger:

-    def __init__(self, text):
+    def __init__(self):
         # This regular expression is really good at tokenizing a text!
         self._re_sentence = re.compile(r'''(?x)   # set flag to allow verbose regexps
             (?:[A-Z])(?:\.[A-Z])+\.?              # abbreviations, e.g. U.S.A.

@@ -19,18 +19,18 @@ class Tagger:
             | [][.,;"'?!():-_`]                   # these are separate tokens
         ''', re.UNICODE | re.MULTILINE | re.DOTALL)
         self.buffer = []
-        self.text = clean_text(text)
-        self.start()
+        #self.start()

-    def clean_text(text):
+    def clean_text(self, text):
         """Clean the text for better POS tagging.
         For now, only removes (short) XML tags.
         """
         return re.sub(r'<[^>]{0,45}>', '', text)

     def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
-        text = self.clean_text(text)
+        self.text = self.clean_text(text)
         grammar = nltk.RegexpParser(label + ': ' + rule)
         tagged_tokens = list(self.tag_text(self.text))
         if len(tagged_tokens):
...
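The `Tagger` change inverts the lifecycle: a tagger is now built once with no text (`__init__(self)`), and `extract()` cleans and stores each text itself through the now-bound `clean_text()`. A hedged usage sketch of the new calling convention (`SomeTagger` stands in for a concrete `Tagger` subclass such as those loaded by `load_tagger()` elsewhere in this diff; the old API was `SomeTagger(text)` per document):

```python
# Hedged sketch, not the project's code: build one tagger and reuse it per document.
tagger = SomeTagger()                          # old API required the text at construction time
for text in ("A first <b>title</b>.", "A second abstract."):
    for ngram in tagger.extract(text):         # extract() strips short XML tags itself
        # each ngram is a sequence of (form, tag) pairs, as consumed by extract_ngrams()
        print([token[0] for token in ngram])
```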
@@ -82,6 +82,7 @@ def parse_extract_indexhyperdata(corpus):
     favs = corpus.add_child(
         typename='FAVORITES', name='favorite docs in "%s"' % corpus.name
     )
     session.add(favs)
     session.commit()
     print('CORPUS #%d: [%s] new favorites node #%i' % (corpus.id, t(), favs.id))

@@ -265,7 +266,7 @@ def recount(corpus):
     # -> specclusion/genclusion: compute + write (=> NodeNodeNgram)
     (spec_id, gen_id) = compute_specgen(corpus, cooc_matrix = coocs,
                                         spec_overwrite_id = old_spec_id,
                                         gen_overwrite_id = old_gen_id)
     print('RECOUNT #%d: [%s] updated spec-clusion node #%i' % (corpus.id, t(), spec_id))
...
@@ -22,17 +22,13 @@ def prepare_stemmers(corpus):
     """
     Returns *several* stemmers (one for each language in the corpus)
     (as a dict of stemmers with key = language_iso2)
+    languages has been previously filtered by supported source languages
+    and formatted
     """
-    stemmers_by_lg = {
-        # always get a generic stemmer in case language code unknown
-        '__unknown__' : SnowballStemmer("english")
-    }
-    for lang in corpus.languages.keys():
-        print(lang)
-        if (lang != '__skipped__'):
-            lgname = languages[lang].name.lower()
-            stemmers_by_lg[lang] = SnowballStemmer(lgname)
-    return stemmers_by_lg
+    stemmers = {lang:SnowballStemmer(languages[lang].name.lower()) for lang \
+                in corpus.languages.keys() if lang !="__skipped__"}
+    stemmers['__unknown__'] = SnowballStemmer("english")
+    return stemmers

 def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
     """

@@ -40,7 +36,6 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
     2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
     3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)
     """
-    print(corpus.languages.keys())
     stop_ngrams_ids = {}
     # we will need the ngrams of the stoplist to filter

@@ -60,16 +55,17 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
     my_groups = defaultdict(Counter)

     # preloop per doc to sort ngrams by language
-    for doc in corpus.children():
-        if ('language_iso2' in doc.hyperdata):
-            lgid = doc.hyperdata['language_iso2']
-        else:
-            lgid = "__unknown__"
-
-        # doc.ngrams is an sql query (ugly but useful intermediate step)
-        # FIXME: move the counting and stoplist filtering up here
-        for ngram_pack in doc.ngrams.all():
-            todo_ngrams_per_lg[lgid].add(ngram_pack)
+    for doc in corpus.children('DOCUMENT'):
+        if doc.id not in corpus.skipped_docs:
+            if ('language_iso2' in doc.hyperdata):
+                lgid = doc.hyperdata['language_iso2']
+            else:
+                lgid = "__unknown__"
+
+            # doc.ngrams is an sql query (ugly but useful intermediate step)
+            # FIXME: move the counting and stoplist filtering up here
+            for ngram_pack in doc.ngrams.all():
+                todo_ngrams_per_lg[lgid].add(ngram_pack)

     # --------------------
     # long loop per ngrams
...
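`prepare_stemmers` collapses into a dict comprehension keyed by the iso2 codes in `corpus.languages` (now pre-filtered at parse time), keeping an `'__unknown__'` English fallback, and the `compute_groups` preloop skips documents listed in `corpus.skipped_docs`. A small usage sketch of the resulting stemmer dict (illustrative values, not the project's code):

```python
from nltk.stem.snowball import SnowballStemmer

# Shaped like the return value of prepare_stemmers(): iso2 code -> stemmer.
stemmers = {'en': SnowballStemmer('english'),
            '__unknown__': SnowballStemmer('english')}

for lgid in ('en', 'xx'):                                  # 'xx': a code that was not prepared
    stemmer = stemmers.get(lgid, stemmers['__unknown__'])  # fall back to the generic stemmer
    print(lgid, stemmer.stem('running'))                   # both print 'run'
```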
@@ -3,7 +3,6 @@ from gargantext.models import *
 from gargantext.constants import *
 from collections import defaultdict
 from re import sub
 from gargantext.util.scheduling import scheduled

 def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):

@@ -44,51 +43,50 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
     db, cursor = get_cursor()
     nodes_ngrams_count = defaultdict(int)
     ngrams_data = set()
-    # extract ngrams
+    #1 corpus = 1 resource
     resource = corpus.resources()[0]
+    source = get_resource(resource["type"])
     documents_count = 0
-    source = get_resource(resource["type"])
     #load available taggers for source default langage
+    #skipped documents that have been skipped previously for parsing error or unsupported language
+    tagger_bots = {lang: load_tagger(lang) for lang in corpus.languages if lang != "__skipped__"}
     docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
-    tagger_bots = {lang: load_tagger(lang)() for lang in corpus.languages if lang != "__skipped__"}
     #sort docs by lang?
-    for documents_count, document in enumerate(docs):
-        try:
-            lang_doc = document.hyperdata["language_iso2"]
-        except AttributeError:
-            print("NO LANG DETECTED")
-            document.status("NGRAMS", error="No lang detected?")
-            corpus.skipped_docs.append(document.id)
-            continue
-        for key in keys:
-            value = document.get(key, None)
-            print("VAL", value)
-            if not isinstance(value, str):
-                continue
-            # get ngrams
-            for ngram in tagger_bots[lang_doc](value):
-                tokens = tuple(normalize_forms(token[0]) for token in ngram)
-                print("tk", tokens)
-                if do_subngrams:
-                    # ex tokens = ["very", "cool", "exemple"]
-                    #    subterms = [['very', 'cool'],
-                    #                ['very', 'cool', 'exemple'],
-                    #                ['cool', 'exemple']]
-                    subterms = subsequences(tokens)
-                else:
-                    subterms = [tokens]
-
-                for seqterm in subterms:
-                    ngram = ' '.join(seqterm)
-                    if len(ngram) > 1:
-                        # doc <=> ngram index
-                        nodes_ngrams_count[(document.id, ngram)] += 1
-                        # add fields : terms n
-                        ngrams_data.add((ngram[:255], len(seqterm), ))
+    for lang, tagger in tagger_bots.items():
+        for documents_count, document in enumerate(docs):
+            language_iso2 = document.hyperdata.get('language_iso2', lang)
+            #print(language_iso2)
+            for key in keys:
+                try:
+                    value = document[str(key)]
+                    if not isinstance(value, str):
+                        continue
+                    # get ngrams
+                    for ngram in tagger.extract(value):
+                        tokens = tuple(normalize_forms(token[0]) for token in ngram)
+                        if do_subngrams:
+                            # ex tokens = ["very", "cool", "exemple"]
+                            #    subterms = [['very', 'cool'],
+                            #                ['very', 'cool', 'exemple'],
+                            #                ['cool', 'exemple']]
+                            subterms = subsequences(tokens)
+                        else:
+                            subterms = [tokens]
+
+                        for seqterm in subterms:
+                            ngram = ' '.join(seqterm)
+                            if len(ngram) > 1:
+                                # doc <=> ngram index
+                                nodes_ngrams_count[(document.id, ngram)] += 1
+                                # add fields : terms n
+                                ngrams_data.add((ngram[:255], len(seqterm), ))
+                except:
+                    #value not in doc
+                    pass
+            # except AttributeError:
+            #     print("ERROR NO language_iso2")
+            #     document.status("NGRAMS", error="No lang detected skipped Ngrams")
+            #     corpus.skipped_docs.append(document.id)

     # integrate ngrams and nodes-ngrams
     if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
         _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)

@@ -97,6 +95,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
     if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
         corpus.status('Ngrams', progress=documents_count+1)
         corpus.save_hyperdata()
+        session.add(corpus)
         session.commit()
     else:
         # integrate ngrams and nodes-ngrams
...
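In `extract_ngrams`, the loop is inverted: rather than looking up a tagger per document, it now iterates over the per-language taggers and, when a document carries no `language_iso2`, falls back to the tagger's own language; missing fields are swallowed by a bare `except` instead of flagging the document. The `do_subngrams` branch still expands each ngram into contiguous sub-windows, as the inline comment shows. A self-contained sketch of that expansion (the project's `subsequences()` helper may differ in details, for example whether single tokens are included):

```python
def contiguous_windows(tokens, min_len=2):
    """All contiguous token windows of at least min_len tokens."""
    n = len(tokens)
    return [list(tokens[i:j]) for i in range(n) for j in range(i + min_len, n + 1)]

print(contiguous_windows(("very", "cool", "exemple")))
# [['very', 'cool'], ['very', 'cool', 'exemple'], ['cool', 'exemple']]
```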
 from gargantext.util.db import *
 from gargantext.models import *
 from gargantext.constants import *
-from gargantext.util.parsers import *
+#from gargantext.util.parsers import *
 from collections import defaultdict, Counter
 from re import sub

@@ -11,13 +11,11 @@ def parse(corpus):
     corpus.status('Docs', progress=0)
     #1 corpus => 1 resource
     resources = corpus.resources()
-    #get the sources capabilities for a given corpus
-    #print(resource)
+    #get the sources capabilities for a given corpus resource
     sources = [get_resource(resource["type"]) for resource in corpus.resources() if resource["extracted"] is False]
-    print(sources)
     if len(sources) == 0:
         #>>> documents have already been parsed?????
-        pass
+        return
     if len(sources) > 0:
         #>>> necessairement 1 corpus = 1 source dans l'archi actuelle
         source = sources[0]

@@ -27,25 +25,24 @@ def parse(corpus):
             #corpus.status(error)
             raise ValueError("Resource '%s' has no Parser" %resource["name"])
         else:
+            #observed langages in corpus docs
             corpus.languages = defaultdict.fromkeys(source["default_languages"], 0)
+            skipped_languages = []
+            #remember the skipped docs in parsing
             corpus.skipped_docs = []
             session.add(corpus)
             session.commit()
             #load the corresponding parser
             parserbot = load_parser(source)
-            skipped_languages = []
             # extract and insert documents from resource.path into database
-            #print(resource["path"])
             for hyperdata in parserbot(resource["path"]):
-                # indexed text fields defined in constants
+                # indexed text fields defined in CONSTANTS
                 for k in DEFAULT_INDEX_FIELDS:
                     if k in hyperdata.keys():
                         try:
                             hyperdata[k] = normalize_chars(hyperdata[k])
                         except Exception as error :
                             hyperdata["error"] = "Error normalize_chars"

                 # a simple census to raise language info at corpus level
                 if "language_iso2" in hyperdata.keys():
                     try:

@@ -54,33 +51,43 @@ def parse(corpus):
                         print("KeyError", hyperdata["language_iso2"])
                         hyperdata["error"] = "Error: unsupported language"
                         skipped_languages.append(hyperdata["language_iso2"])
-                elif "language_iso3" in hyperdata.keys():
+                elif "language_fullname" in hyperdata.keys():
                     try:
-                        lang = language[hyperdata["language_iso3"]]
+                        #full => iso2
+                        lang = languages[hyperdata["language_fullname"]].name.lower()
                         corpus.languages[lang] += 1
                     except KeyError:
-                        print("KeyError", lang)
+                        print("KeyError", hyperdata["language_fullname"])
                         hyperdata["error"] = "Error: unsupported language"
-                        skipped_languages.append(hyperdata["language_iso2"])
+                        skipped_languages.append(lang)
                 else:
-                    raise ValueError("PARSING ERROR: No lang detected")
+                    pass
+                    #no language have been indexed
+                    #detectlang by index_fields
+                    # for k in DEFAULT_INDEX_FIELDS:
+                    #     if k in hyperdata.keys():
+                    #         try:
+                    #             hyperdata["language_iso2"] = langdetect(hyperdata[k])
+                    #         except Exception as error :
+                    #             pass
+                #print(hyperdata.keys())

                 # save as DB child
                 # ----------------
                 document = corpus.add_child(
                     typename = 'DOCUMENT',
                     name = hyperdata.get('title', '')[:255],
                     hyperdata = hyperdata,
                 )
                 session.add(document)

                 if "error" in hyperdata.keys():
                     #document.status("error")
-                    print(hyperdata["error"])
                     document.status('Parsing', error= document.hyperdata["error"])
                     document.save_hyperdata()
                     session.commit()
+                    #adding skipped_docs for later processsing
                     corpus.skipped_docs.append(document.id)
+                documents_count += 1

                 # logging
                 if documents_count % BATCH_PARSING_SIZE == 0:

@@ -92,11 +99,13 @@ def parse(corpus):
-                documents_count += 1
             # update info about the resource
             resource['extracted'] = True

             # add a corpus-level info about languages adding a __skipped__ info
             corpus.languages['__skipped__'] = Counter(skipped_languages)
+            for n in corpus.languages.items():
+                print(n)

             # commit all changes
             corpus.status('Docs', progress=documents_count, complete=True)
             corpus.save_hyperdata()
...
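The census in `parse()` now works in three steps: `corpus.languages` is seeded from the source's `default_languages`, incremented per document from `language_iso2` (or from `language_fullname` mapped through `languages`), and every unsupported code is collected into a `Counter` stored under `'__skipped__'`, while the offending documents are queued in `corpus.skipped_docs`. A standalone sketch of the resulting structure ('zz' stands in for an unsupported code):

```python
from collections import Counter, defaultdict

# Seeded from source["default_languages"]; fromkeys() sets no default_factory,
# so an unsupported code raises KeyError, exactly as in the diff above.
corpus_languages = defaultdict.fromkeys(["en", "fr"], 0)
skipped_languages = []

for doc_lang in ("en", "en", "fr", "zz"):        # per-document language_iso2 values
    try:
        corpus_languages[doc_lang] += 1
    except KeyError:
        skipped_languages.append(doc_lang)

corpus_languages['__skipped__'] = Counter(skipped_languages)
print(dict(corpus_languages))
# {'en': 2, 'fr': 1, '__skipped__': Counter({'zz': 1})}
```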
@@ -37,7 +37,7 @@ def docs_by_titles(request, project_id, corpus_id):
             'date': datetime.now(),
             'project': project,
             'corpus': corpus,
-            'resourcename' : resourcename(corpus),
+            'resourcename' : get_resource_by_name(corpus.resources()[0]),
             'view': 'titles',
             'user': request.user
         },

@@ -65,7 +65,7 @@ def docs_by_journals(request, project_id, corpus_id):
             'date': datetime.now(),
             'project': project,
             'corpus' : corpus,
-            'resourcename' : resourcename(corpus),
+            'resourcename' : get_resource_by_name(corpus.resources()[0]),
             'view': 'journals'
         },
     )

@@ -84,11 +84,8 @@ def analytics(request, project_id, corpus_id):
             'date': datetime.now(),
             'project': project,
             'corpus': corpus,
-            'resourcename' : resourcename(corpus),
+            'resourcename' : get_resource_by_name(corpus.resources()[0]),
             'view': 'analytics',
             'user': request.user
         },
     )