Commit 5499546e authored by delanoe

[FIX] merge.

parents f64176b7 818c54d5
@@ -172,13 +172,22 @@ DEFAULT_TFIDF_CUTOFF_RATIO = .45 # MAINLIST maximum terms in %
 DEFAULT_TFIDF_HARD_LIMIT = 750          # MAINLIST maximum terms abs
                                         # (makes COOCS larger ~ O(N²) /!\)
-DEFAULT_COOC_THRESHOLD = 5              # inclusive minimum for COOCS coefs
+DEFAULT_COOC_THRESHOLD = 3              # inclusive minimum for COOCS coefs
                                         # (makes COOCS more sparse)
 DEFAULT_MAPLIST_MAX = 300               # MAPLIST maximum terms
 DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5    # part of monograms in MAPLIST
+                                        # (NB: used to be 0.005 !!)
+DEFAULT_MAX_NGRAM_LEN = 7               # limit used after POStagging rule
+                                        # (initial ngrams number is a power law of this /!\)
+                                        # (and most longer ngrams have tiny freq anyway)
+DEFAULT_ALL_LOWERCASE_FLAG = True       # lowercase ngrams before recording
+                                        # them to their DB table
+                                        # (potentially bad for acronyms but
+                                        #  good for variants like same term
+                                        #  occurring at sentence beginning)

 # ------------------------------------------------------------------------------
...
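A quick order-of-magnitude check of the "COOCS larger ~ O(N²)" warning above (a sketch, not project code): with the mainlist capped at DEFAULT_TFIDF_HARD_LIMIT terms, the cooccurrence matrix can hold up to N·(N−1)/2 distinct unordered pairs.

```python
# rough size intuition for the O(N²) comment above
N = 750                   # DEFAULT_TFIDF_HARD_LIMIT
print(N * (N - 1) // 2)   # -> 280875 distinct unordered term pairs, worst case
```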
 from gargantext.util.languages import languages
-from gargantext.constants import LANGUAGES
+from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN

 import nltk
 import re
@@ -17,7 +17,7 @@ class NgramsExtractor:
         """
         return re.sub(r'<[^>]{0,45}>', '', text)

-    def extract(self, text, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', label='NP'):
+    def extract(self, text, rule='{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}', label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
         text = self.clean_text(text)
         grammar = nltk.RegexpParser(label + ': ' + rule)
         tagged_tokens = list(self._tagger.tag_text(text))
@@ -25,7 +25,9 @@ class NgramsExtractor:
         grammar_parsed = grammar.parse(tagged_tokens)
         for subtree in grammar_parsed.subtrees():
             if subtree.label() == label:
-                yield subtree.leaves()
+                if len(subtree) < max_n_words:
+                    yield subtree.leaves()
+                    # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]

 class NgramsExtractors(dict):
...
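To see what the widened NP rule matches, here is a standalone sketch using only nltk, with hand-written Penn-style tags standing in for real tagger output (the sample tokens are illustrative):

```python
import nltk

# the new NP rule from extract(), tried on a hand-tagged token list
rule = '{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}'
grammar = nltk.RegexpParser('NP: ' + rule)

tagged_tokens = [('effects', 'NNS'), ('of', 'IN'),
                 ('wild', 'JJ'), ('pollinators', 'NNS')]

for subtree in grammar.parse(tagged_tokens).subtrees():
    if subtree.label() == 'NP' and len(subtree) < 7:   # 7 = DEFAULT_MAX_NGRAM_LEN
        print(subtree.leaves())
# -> [('effects', 'NNS'), ('of', 'IN'), ('wild', 'JJ'), ('pollinators', 'NNS')]
# the old rule '{<JJ.*>*<NN.*>+<JJ.*>*}' would have yielded
# 'effects' and 'wild pollinators' as two separate chunks
```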
@@ -15,9 +15,9 @@ class identity_dict(dict):
 _tag_replacements = identity_dict({
     "NOM": "NN",
     "NAM": "NN",
-    "ADJ": "NN",
-    "VER": "JJ",
-    "PREP": "PRP",
+    "ADJ": "JJ",
+    "VER": "VB",
+    "PREP": "IN",
     "KON": "CC",
     "DET": "DT",
     "PRO": "DT",
...
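The corrected mappings matter because the extraction rule above only looks for Penn-style <JJ.*>/<NN.*>/<IN> tags, while TreeTagger emits ADJ/VER/PREP for French. identity_dict itself is defined just above this hunk in the source; a plausible minimal reading, assuming unknown tags fall through unchanged:

```python
class identity_dict(dict):
    """Assumed behaviour: a dict that returns the key itself
       when no replacement is registered for it."""
    def __missing__(self, key):
        return key

_tag_replacements = identity_dict({
    "NOM": "NN", "NAM": "NN", "ADJ": "JJ", "VER": "VB",
    "PREP": "IN", "KON": "CC", "DET": "DT", "PRO": "DT",
})

print(_tag_replacements["ADJ"])   # -> 'JJ'  (remapped: adjectives now chunk as <JJ.*>)
print(_tag_replacements["SENT"])  # -> 'SENT' (no mapping: passes through as-is)
```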
@@ -14,8 +14,14 @@ class TurboTagger:
     def tag_text(self, text):
         if not hasattr(self, '_nlpclient'):
             self._nlpclient = NLPClient()
-        tokens_tags = []
-        for sentence in self._nlpclient.tag(text):
-            for token, tag in sentence:
-                tokens_tags.append((token, tag, ))
-        return tokens_tags
+        try:
+            tokens_tags = []
+            for sentence in self._nlpclient.tag(text):
+                for token, tag in sentence:
+                    tokens_tags.append((token, tag, ))
+            return tokens_tags
+        except ConnectionRefusedError as e:
+            print(e)
+            print("TurboTagger: problem with the NLPServer (try running gargantext/parsing/Taggers/lib/nlpserver/server.py)")
+            # TODO abort workflow?
+            return []
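Since the except branch returns [] silently, a caller cannot distinguish an empty tagging from a dead NLPServer. One cheap way to fail fast would be a reachability probe before the workflow starts; host and port below are placeholders, the real values live in the nlpserver settings:

```python
import socket

def nlpserver_reachable(host='localhost', port=1234):  # placeholder host/port
    """Probe the TCP port so the workflow can abort early instead of
       silently producing [] for every document."""
    try:
        with socket.create_connection((host, port), timeout=2):
            return True
    except OSError:   # includes ConnectionRefusedError and timeouts
        return False
```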
@@ -105,6 +105,8 @@ def parse_extract_indexhyperdata(corpus):
                                  grouplist_id=group_id)
     print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))

+    print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t()))
+

 def t():
     return datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
@@ -81,16 +81,20 @@ def do_maplist(corpus,
                          .limit(multigrams_limit)
                          .all()
                     )
-    print("MAPLIST: top_monograms =", len(top_monograms))
-    print("MAPLIST: top_multigrams = ", len(top_multigrams))
+    obtained_mono = len(top_monograms)
+    obtained_multi = len(top_multigrams)
+    obtained_total = obtained_mono + obtained_multi
+    # print("MAPLIST: top_monograms =", obtained_mono)
+    # print("MAPLIST: top_multigrams = ", obtained_multi)
+    print("MAPLIST: kept %i ngrams in total " % obtained_total)

     # NEW MAPLIST NODE
     # -----------------
     # saving the parameters of the analysis in the Node JSON
     new_hyperdata = { 'corpus': corpus.id,
                       'limit' : limit,
-                      'monograms_part' : monograms_part
+                      'monograms_part' : monograms_part,
+                      'monograms_result' : obtained_mono/obtained_total
                     }
     if overwrite_id:
         # overwrite pre-existing node
...
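For the record, the presumed relation between the constants above, the two query limits and the new 'monograms_result' field (a sketch; the exact rounding used in do_maplist may differ):

```python
limit = 300                  # DEFAULT_MAPLIST_MAX
monograms_part = .5          # DEFAULT_MAPLIST_MONOGRAMS_RATIO

monograms_limit = round(limit * monograms_part)   # 150 slots for monograms
multigrams_limit = limit - monograms_limit        # 150 slots for multigrams

# 'monograms_result' records the share actually obtained, e.g. if the
# corpus only yields 138 monograms:
obtained_mono, obtained_multi = 138, 150
print(obtained_mono / (obtained_mono + obtained_multi))   # -> 0.479...
```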
@@ -4,6 +4,7 @@ from gargantext.constants import *
 from gargantext.util.ngramsextractors import ngramsextractors

 from collections import defaultdict
+from re import sub

 from gargantext.util.scheduling import scheduled
@@ -32,7 +33,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
     db.commit()

-def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )):
+def extract_ngrams(corpus, keys=('title', 'abstract', )):
     """Extract ngrams for every document below the given corpus.
     Default language is given by the resource type.
     The result is then inserted into the database.
@@ -62,7 +63,7 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )):
                     # get ngrams
                     for ngram in ngramsextractor.extract(value):
                         tokens = tuple(token[0] for token in ngram)
-                        terms = ' '.join(tokens)
+                        terms = normalize_terms(' '.join(tokens))
                         nodes_ngrams_count[(document.id, terms)] += 1
                         ngrams_data.add((terms[:255], len(tokens), ))

     # integrate ngrams and nodes-ngrams
@@ -84,3 +85,21 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )):
         corpus.save_hyperdata()
         session.commit()
         raise error
+
+
+def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
+    """
+    Removes unwanted leading and trailing punctuation
+    AND optionally puts everything to lowercase
+
+    ex /'ecosystem services'/ => /ecosystem services/
+
+    (benefits from normalize_chars upstream so there are fewer cases to consider)
+    """
+    term_str = sub(r'^[-",;/%(){}\\\[\]\.\']+', '', term_str)
+    term_str = sub(r'[-",;/%(){}\\\[\]\.\']+$', '', term_str)
+    if do_lowercase:
+        term_str = term_str.lower()
+    return term_str
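A quick check of normalize_terms on terms as they come out of the extractor (note the acronym caveat already flagged next to DEFAULT_ALL_LOWERCASE_FLAG):

```python
print(normalize_terms("'ecosystem services',"))      # -> ecosystem services
print(normalize_terms("(RNA)"))                      # -> rna   (lowercased by default)
print(normalize_terms("(RNA)", do_lowercase=False))  # -> RNA
```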
@@ -3,6 +3,7 @@ from gargantext.models import *
 from gargantext.constants import *

 from collections import defaultdict
+from re import sub

 def parse(corpus):
     try:
@@ -21,14 +22,26 @@ def parse(corpus):
             resource_path = resource['path']

             # extract and insert documents from corpus resource into database
             for hyperdata in resource_parser(resource_path):
+                # uniformize the text values for easier POStagging and processing
+                for k in ['abstract', 'title']:
+                    if k in hyperdata:
+                        hyperdata[k] = normalize_chars(hyperdata[k])
+
+                # save as DB child
+                # ----------------
                 document = corpus.add_child(
                     typename = 'DOCUMENT',
                     name = hyperdata.get('title', '')[:255],
                     hyperdata = hyperdata,
                 )
                 session.add(document)

+                # a simple census to raise language info at corpus level
                 if "language_iso2" in hyperdata:
                     observed_languages[hyperdata["language_iso2"]] += 1

+                # logging
                 if documents_count % BATCH_PARSING_SIZE == 0:
                     corpus.status('parsing', progress=documents_count)
                     corpus.save_hyperdata()
@@ -47,3 +60,116 @@ def parse(corpus):
         corpus.save_hyperdata()
         session.commit()
         raise error
+
+
+def normalize_chars(my_str):
+    """
+    Simplification of the character strings fed to the DB
+    ("the characters we would rather never see")
+       - normalization
+          > spaces
+          > hyphens
+          > quotation marks
+       - unligatures
+    NB: this normalization of the input text does not remove punctuation,
+    it transcodes it to a more canonical form to reduce its diversity
+      (more invasive treatments, such as removing quotes or lowercasing,
+       belong rather *after* the tagger,
+       cf. toolchain.ngrams_extraction.normalize_terms)
+    """
+    # --------------
+    # S P A C E S
+    # --------------
+    # all control characters (incl. \t = \x{0009}, \n = \x{000A} and \r = \x{000D}) --> space
+    my_str = sub(r'[\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F\u007F]', ' ', my_str)
+
+    # line separator and paragraph separator --> space
+    my_str = sub(r'\u2028', ' ', my_str)
+    my_str = sub(r'\u2029', ' ', my_str)
+
+    # U+0092: sometimes a quote, sometimes a control character
+    my_str = sub(r'\u0092', ' ', my_str)
+
+    # all alternative spaces --> space
+    my_str = sub(r'[\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u3000\uFEFF]', ' ', my_str)
+
+    # finally remove superfluous whitespace
+    # (leading, trailing and repeated spaces)
+    my_str = sub(r'\s+', ' ', my_str)
+    my_str = sub(r'^\s', '', my_str)
+    my_str = sub(r'\s$', '', my_str)
+
+    # ------------------------
+    # P U N C T U A T I O N
+    # ------------------------
+    # most alternative dashes --> plain hyphen-minus
+    # (in order: U+2010 U+2011 U+2012 U+2013 U+2014 U+2015 U+2212 U+FE63)
+    my_str = sub(r'[‐‑‒–—―−﹣]', '-', my_str)
+    # the macron is also sometimes used as a dash
+    my_str = sub(r'\u00af', '-', my_str)
+
+    # Quotation marks
+    # ---------------
+    # most single quotes --> ' APOSTROPHE
+    my_str = sub(r"[‘’‚`‛]", "'", my_str)   # U+2018 U+2019 U+201A U+201B
+    my_str = sub(r'‹ ?', "'", my_str)       # U+2039 plus optional following space
+    my_str = sub(r' ?›', "'", my_str)       # U+203A plus optional preceding space
+    # most double quotes --> " QUOTATION MARK
+    my_str = sub(r'[“”„‟]', '"', my_str)    # U+201C U+201D U+201E U+201F
+    my_str = sub(r'« ?', '"', my_str)       # U+00AB plus optional following space
+    my_str = sub(r' ?»', '"', my_str)       # U+00BB plus optional preceding space
+    # two single quotes (prepared above) => one double quote
+    my_str = sub(r"''", '"', my_str)
+
+    # Others
+    # -------
+    my_str = sub(r'…', '...', my_str)
+    # U+0085 sometimes used as '...'
+    my_str = sub(r'\u0085', '...', my_str)
+    my_str = sub(r'€', 'EUR', my_str)
+
+    # a few common bullet characters
+    my_str = sub(r'▪', '*', my_str)
+    my_str = sub(r'►', '*', my_str)
+    my_str = sub(r'●', '*', my_str)
+    my_str = sub(r'◘', '*', my_str)
+    my_str = sub(r'→', '*', my_str)
+    my_str = sub(r'•', '*', my_str)
+    my_str = sub(r'·', '*', my_str)
+
+    # ------------------
+    # L I G A T U R E S
+    # ------------------
+    my_str = sub(r'Ꜳ', 'AA', my_str)
+    my_str = sub(r'ꜳ', 'aa', my_str)
+    my_str = sub(r'Æ', 'AE', my_str)
+    my_str = sub(r'æ', 'ae', my_str)
+    my_str = sub(r'Ǳ', 'DZ', my_str)
+    my_str = sub(r'ǲ', 'Dz', my_str)
+    my_str = sub(r'ǳ', 'dz', my_str)
+    my_str = sub(r'ﬃ', 'ffi', my_str)
+    my_str = sub(r'ﬀ', 'ff', my_str)
+    my_str = sub(r'ﬁ', 'fi', my_str)
+    my_str = sub(r'ﬄ', 'ffl', my_str)
+    my_str = sub(r'ﬂ', 'fl', my_str)
+    my_str = sub(r'ﬅ', 'ft', my_str)
+    my_str = sub(r'Ĳ', 'IJ', my_str)
+    my_str = sub(r'ĳ', 'ij', my_str)
+    my_str = sub(r'Ǉ', 'LJ', my_str)
+    my_str = sub(r'ǉ', 'lj', my_str)
+    my_str = sub(r'Ǌ', 'NJ', my_str)
+    my_str = sub(r'ǌ', 'nj', my_str)
+    my_str = sub(r'Œ', 'OE', my_str)
+    my_str = sub(r'œ', 'oe', my_str)
+    my_str = sub(r'\u009C', 'oe', my_str)   # U+009C (control char sometimes seen in place of oe)
+    my_str = sub(r'ﬆ', 'st', my_str)
+    my_str = sub(r'Ꜩ', 'Tz', my_str)
+    my_str = sub(r'ꜩ', 'tz', my_str)
+
+    return my_str
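And a round-trip sanity check for normalize_chars, covering curly quotes, dashes, ligatures and doubled spaces:

```python
s = '“eﬃcient  pollinators’ beneﬁts” — ﬁeld study…'
print(normalize_chars(s))
# -> "efficient pollinators' benefits" - field study...
```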