Commit 5499546e authored by delanoe

[FIX] merge.

parents f64176b7 818c54d5
@@ -172,13 +172,22 @@ DEFAULT_TFIDF_CUTOFF_RATIO = .45 # MAINLIST maximum terms in %
DEFAULT_TFIDF_HARD_LIMIT = 750 # MAINLIST maximum terms abs
# (makes COOCS larger ~ O(N²) /!\)
-DEFAULT_COOC_THRESHOLD = 5 # inclusive minimum for COOCS coefs
+DEFAULT_COOC_THRESHOLD = 3 # inclusive minimum for COOCS coefs
+# (makes COOCS more sparse)
DEFAULT_MAPLIST_MAX = 300 # MAPLIST maximum terms
DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5 # part of monograms in MAPLIST
# (NB: used to be 0.005 !!)
DEFAULT_MAX_NGRAM_LEN = 7 # limit used after POStagging rule
# (initial ngrams number is a power law of this /!\)
# (and most longer ngrams have tiny freq anyway)
+DEFAULT_ALL_LOWERCASE_FLAG = True # lowercase ngrams before recording
+# them to their DB table
+# (potentially bad for acronyms but
+# good for variants like same term
+# occurring at sentence beginning)
# ------------------------------------------------------------------------------
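A minimal sketch (hypothetical counts, not from the codebase) of what lowering the cooccurrence threshold changes: coefficients below the inclusive minimum are dropped, so a lower value keeps more, weaker edges.

    DEFAULT_COOC_THRESHOLD = 3
    coocs = {('wild', 'pollinators'): 4, ('wild', 'type'): 2}
    kept = {pair: n for pair, n in coocs.items() if n >= DEFAULT_COOC_THRESHOLD}
    # old threshold 5: both pairs dropped; new threshold 3: ('wild', 'pollinators') kept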
......
from gargantext.util.languages import languages
-from gargantext.constants import LANGUAGES
+from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN
import nltk
import re
@@ -17,7 +17,7 @@ class NgramsExtractor:
"""
return re.sub(r'<[^>]{0,45}>', '', text)
-def extract(self, text, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', label='NP'):
+def extract(self, text, rule='{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}', label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
text = self.clean_text(text)
grammar = nltk.RegexpParser(label + ': ' + rule)
tagged_tokens = list(self._tagger.tag_text(text))
@@ -25,7 +25,9 @@ class NgramsExtractor:
grammar_parsed = grammar.parse(tagged_tokens)
for subtree in grammar_parsed.subtrees():
if subtree.label() == label:
-yield subtree.leaves()
+if len(subtree) < max_n_words:
+    yield subtree.leaves()
# ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
class NgramsExtractors(dict):
......
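A standalone sketch of what the widened chunk rule accepts (the hand-tagged sample is illustrative, not from the codebase): noun groups can now be chained through a preposition, so "loss of wild pollinators" comes out as one NP instead of two.

    import nltk

    rule = '{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}'
    grammar = nltk.RegexpParser('NP: ' + rule)
    # hand-tagged sample; in the pipeline the tags come from the tagger
    tagged = [('loss', 'NN'), ('of', 'IN'), ('wild', 'JJ'), ('pollinators', 'NNS')]
    for subtree in grammar.parse(tagged).subtrees():
        if subtree.label() == 'NP' and len(subtree) < 7:  # 7 = DEFAULT_MAX_NGRAM_LEN
            print(subtree.leaves())
    # prints all four (token, tag) pairs as a single NP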
@@ -15,9 +15,9 @@ class identity_dict(dict):
_tag_replacements = identity_dict({
"NOM": "NN",
"NAM": "NN",
"ADJ": "NN",
"VER": "JJ",
"PREP": "PRP",
"ADJ": "JJ",
"VER": "VB",
"PREP": "IN",
"KON": "CC",
"DET": "DT",
"PRO": "DT",
......
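The identity_dict body is collapsed out of this diff; presumably it returns unknown keys unchanged, so tags that are already Penn-style pass through while TreeTagger tags get remapped. A minimal sketch of that assumption:

    class identity_dict(dict):
        # unknown keys map to themselves (assumed behaviour, body not shown in the diff)
        def __missing__(self, key):
            return key

    _tag_replacements = identity_dict({"ADJ": "JJ", "VER": "VB", "PREP": "IN"})
    print(_tag_replacements["VER"])  # 'VB'  -- TreeTagger tag remapped
    print(_tag_replacements["NNS"])  # 'NNS' -- already Penn-style, passed through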
@@ -14,8 +14,14 @@ class TurboTagger:
def tag_text(self, text):
if not hasattr(self, '_nlpclient'):
self._nlpclient = NLPClient()
-tokens_tags = []
-for sentence in self._nlpclient.tag(text):
-    for token, tag in sentence:
-        tokens_tags.append((token, tag, ))
-return tokens_tags
+try:
+    tokens_tags = []
+    for sentence in self._nlpclient.tag(text):
+        for token, tag in sentence:
+            tokens_tags.append((token, tag, ))
+    return tokens_tags
+except ConnectionRefusedError as e:
+    print(e)
+    print("TurboTagger: problem with the NLPServer (try running gargantext/parsing/Taggers/lib/nlpserver/server.py)")
+    # TODO abort workflow?
+    return []
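Since the method now degrades to an empty list when the socket server is unreachable, a caller cannot tell "server down" from "no tokens"; a hypothetical caller-side sketch of the only check currently possible:

    tagger = TurboTagger()
    tokens_tags = tagger.tag_text("Wild pollinators improve crop yields.")
    if not tokens_tags:
        # either the text produced no tokens or the NLPServer is down
        # (in the latter case the ConnectionRefusedError was already printed)
        print("no tags produced; check that nlpserver/server.py is running")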
@@ -105,6 +105,8 @@ def parse_extract_indexhyperdata(corpus):
grouplist_id=group_id)
print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t()))
+def t():
+    return datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
@@ -81,16 +81,20 @@ def do_maplist(corpus,
.limit(multigrams_limit)
.all()
)
print("MAPLIST: top_monograms =", len(top_monograms))
print("MAPLIST: top_multigrams = ", len(top_multigrams))
obtained_mono = len(top_monograms)
obtained_multi = len(top_multigrams)
obtained_total = obtained_mono + obtained_multi
# print("MAPLIST: top_monograms =", obtained_mono)
# print("MAPLIST: top_multigrams = ", obtained_multi)
print("MAPLIST: kept %i ngrams in total " % obtained_total)
# NEW MAPLIST NODE
# -----------------
# saving the parameters of the analysis in the Node JSON
new_hyperdata = { 'corpus': corpus.id,
'limit' : limit,
-'monograms_part' : monograms_part
+'monograms_part' : monograms_part,
+'monograms_result' : obtained_mono/obtained_total
}
if overwrite_id:
# overwrite pre-existing node
......
@@ -4,6 +4,7 @@ from gargantext.constants import *
from gargantext.util.ngramsextractors import ngramsextractors
from collections import defaultdict
from re import sub
+from gargantext.util.scheduling import scheduled
@@ -32,7 +33,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
db.commit()
-def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )):
+def extract_ngrams(corpus, keys=('title', 'abstract', )):
"""Extract ngrams for every document below the given corpus.
Default language is given by the resource type.
The result is then inserted into the database.
@@ -62,7 +63,7 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )):
# get ngrams
for ngram in ngramsextractor.extract(value):
tokens = tuple(token[0] for token in ngram)
-terms = ' '.join(tokens)
+terms = normalize_terms(' '.join(tokens))
nodes_ngrams_count[(document.id, terms)] += 1
ngrams_data.add((terms[:255], len(tokens), ))
# integrate ngrams and nodes-ngrams
@@ -84,3 +85,21 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )):
corpus.save_hyperdata()
session.commit()
raise error
+def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
+    """
+    Removes unwanted leading and trailing punctuation
+    AND optionally puts everything to lowercase
+    ex /'ecosystem services'/ => /ecosystem services/
+    (benefits from normalize_chars upstream so there are fewer cases to consider)
+    """
+    term_str = sub(r'^[-",;/%(){}\\\[\]\.\']+', '', term_str)
+    term_str = sub(r'[-",;/%(){}\\\[\]\.\']+$', '', term_str)
+    if do_lowercase:
+        term_str = term_str.lower()
+    return term_str
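A quick check of the behaviour (hypothetical inputs), showing both the punctuation stripping and the lowercase flag:

    print(normalize_terms("'Ecosystem services'."))    # ecosystem services
    print(normalize_terms("(RNA)"))                    # rna
    print(normalize_terms("DNA", do_lowercase=False))  # DNA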
@@ -3,6 +3,7 @@ from gargantext.models import *
from gargantext.constants import *
from collections import defaultdict
+from re import sub
def parse(corpus):
try:
@@ -21,14 +22,26 @@ def parse(corpus):
resource_path = resource['path']
# extract and insert documents from corpus resource into database
for hyperdata in resource_parser(resource_path):
+# uniformize the text values for easier POStagging and processing
+for k in ['abstract', 'title']:
+    if k in hyperdata:
+        hyperdata[k] = normalize_chars(hyperdata[k])
# save as DB child
# ----------------
document = corpus.add_child(
typename = 'DOCUMENT',
name = hyperdata.get('title', '')[:255],
hyperdata = hyperdata,
)
session.add(document)
+# a simple census to raise language info at corpus level
+if "language_iso2" in hyperdata:
+    observed_languages[hyperdata["language_iso2"]] += 1
# logging
if documents_count % BATCH_PARSING_SIZE == 0:
corpus.status('parsing', progress=documents_count)
corpus.save_hyperdata()
@@ -47,3 +60,116 @@ def parse(corpus):
corpus.save_hyperdata()
session.commit()
raise error
+def normalize_chars(my_str):
+    """
+    Simplifies strings before they enter the DB
+    ("the characters we would rather never see")
+    - normalization
+        > spaces
+        > dashes
+        > quotation marks
+    - ligature expansion
+    NB: this input normalization does not remove punctuation
+    but transcodes it to a more canonical form to reduce its diversity
+    (more invasive treatments, such as removing quotes
+    or lowercasing, belong rather *after* the tagger,
+    cf. toolchain.ngrams_extraction.normalize_terms)
+    """
+    # -----------
+    # S P A C E S
+    # -----------
+    # all control characters (including \t = \x{0009}, \n = \x{000A} and \r = \x{000D}) --> space
+    my_str = sub(r'[\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F\u007F]', ' ', my_str)
+    # line separator (U+2028) and paragraph separator (U+2029)
+    my_str = sub(r'\u2028', ' ', my_str)
+    my_str = sub(r'\u2029', ' ', my_str)
+    # U+0092: sometimes a quote, sometimes a control character
+    my_str = sub(r'\u0092', ' ', my_str)
+    # all alternative spaces --> space
+    my_str = sub(r'[\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u3000\uFEFF]', ' ', my_str)
+    # finally remove superfluous whitespace
+    # (repeated, leading and trailing spaces)
+    my_str = sub(r'\s+', ' ', my_str)
+    my_str = sub(r'^\s', '', my_str)
+    my_str = sub(r'\s$', '', my_str)
+    # ---------------------
+    # P U N C T U A T I O N
+    # ---------------------
+    # most alternative dashes --> plain hyphen-minus
+    # (in order: U+2010 U+2011 U+2012 U+2013 U+2014 U+2015 U+2212 U+FE63, all --> U+002D)
+    my_str = sub(r'[‐‑‒–—―−﹣]', '-', my_str)
+    # the macron (U+00AF) is also sometimes used as a dash
+    my_str = sub(r'\u00af', '-', my_str)
+    # Quotation marks
+    # ---------------
+    # most single quotes --> ' APOSTROPHE
+    my_str = sub(r"[‘’‚`‛]", "'", my_str)  # U+2018 U+2019 U+201A U+201B and backtick
+    my_str = sub(r'‹ ?', "'", my_str)      # U+2039 plus optional space after
+    my_str = sub(r' ?›', "'", my_str)      # U+203A plus optional space before
+    # most double quotes --> " QUOTATION MARK
+    my_str = sub(r'[“”„‟]', '"', my_str)   # U+201C U+201D U+201E U+201F
+    my_str = sub(r'« ?', '"', my_str)      # U+00AB plus optional space after
+    my_str = sub(r' ?»', '"', my_str)      # U+00BB plus optional space before
+    # two single quotes (prepared above) => one double quote
+    my_str = sub(r"''", '"', my_str)
+    # Others
+    # ------
+    my_str = sub(r'…', '...', my_str)
+    # U+0085 (NEL) sometimes used like '...'
+    my_str = sub(r'\u0085', '...', my_str)
+    my_str = sub(r'€', 'EUR', my_str)
+    # a few common bullet characters
+    my_str = sub(r'▪', '*', my_str)
+    my_str = sub(r'►', '*', my_str)
+    my_str = sub(r'●', '*', my_str)
+    my_str = sub(r'◘', '*', my_str)
+    my_str = sub(r'→', '*', my_str)
+    my_str = sub(r'•', '*', my_str)
+    my_str = sub(r'·', '*', my_str)
+    # -----------------
+    # L I G A T U R E S
+    # -----------------
+    my_str = sub(r'Ꜳ', 'AA', my_str)
+    my_str = sub(r'ꜳ', 'aa', my_str)
+    my_str = sub(r'Æ', 'AE', my_str)
+    my_str = sub(r'æ', 'ae', my_str)
+    my_str = sub(r'Ǳ', 'DZ', my_str)
+    my_str = sub(r'ǲ', 'Dz', my_str)
+    my_str = sub(r'ǳ', 'dz', my_str)
+    my_str = sub(r'ﬃ', 'ffi', my_str)
+    my_str = sub(r'ﬀ', 'ff', my_str)
+    my_str = sub(r'ﬁ', 'fi', my_str)
+    my_str = sub(r'ﬄ', 'ffl', my_str)
+    my_str = sub(r'ﬂ', 'fl', my_str)
+    my_str = sub(r'ﬅ', 'ft', my_str)
+    my_str = sub(r'Ĳ', 'IJ', my_str)
+    my_str = sub(r'ĳ', 'ij', my_str)
+    my_str = sub(r'Ǉ', 'LJ', my_str)
+    my_str = sub(r'ǉ', 'lj', my_str)
+    my_str = sub(r'Ǌ', 'NJ', my_str)
+    my_str = sub(r'ǌ', 'nj', my_str)
+    my_str = sub(r'Œ', 'OE', my_str)
+    my_str = sub(r'œ', 'oe', my_str)
+    my_str = sub(r'\u009C', 'oe', my_str)  # U+009C (control character seen as oe)
+    my_str = sub(r'ﬆ', 'st', my_str)
+    my_str = sub(r'Ꜩ', 'Tz', my_str)
+    my_str = sub(r'ꜩ', 'tz', my_str)
+    return my_str
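A quick before/after illustration (hypothetical input) covering the space, dash, quote and ligature handling:

    s = '«\u00A0Eﬃcient – “smart” œuvre…\u00A0»'
    print(normalize_chars(s))   # "Efficient - "smart" oeuvre..."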