Commit 24b52471 authored by delanoe

Merge remote-tracking branch 'origin/romain-testing' into testing

parents d1c00c07 309a940e
......@@ -65,12 +65,12 @@ class EuropresseParser(Parser):
try:
html_parser = html5parser.etree.HTMLParser(encoding=codif)
html = html5parser.etree.fromstring(contents, html_parser)
html_articles = html.xpath('//article')
except Exception as error:
html_articles = None
print ("Europresse lxml error:", error)
print ("Europresse lxml error:", str(error))
# all except detail_header are mandatory to parse the article
name_xpath = "./header/div/span[@class = 'DocPublicationName']"
......@@ -113,7 +113,10 @@ class EuropresseParser(Parser):
# parse all the articles, one by one
if html_articles is not None:
if html_articles is None:
filename = file.name if hasattr(file, 'name') else 'unknown file'
print("WARNING: europresse (skip) 1 file with no parsable content: " + filename)
else:
for html_article in html_articles:
try:
# s'il n'y a pas du tout de header on doit skip
......
......@@ -98,9 +98,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
tokens = tuple(normalize_forms(token[0]) for token in ngram)
if do_subngrams:
# ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'],
# ['very', 'cool', 'exemple'],
# ['cool', 'exemple']]
# subterms = [['very', 'cool'],...]
subterms = subsequences(tokens)
else:
......@@ -108,11 +106,16 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
for seqterm in subterms:
ngram = ' '.join(seqterm)
if len(ngram) > 1:
nbwords = len(seqterm)
nbchars = len(ngram)
if nbchars > 1:
if nbchars > 255:
# max ngram length (DB constraint)
ngram = ngram[:255]
# doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1
# add fields : terms n
ngrams_data.add((ngram[:255], len(seqterm), ))
ngrams_data.add((ngram, nbwords, ))
except:
#value not in doc
continue
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment