Commit 5499546e, authored Mar 24, 2016 by delanoe

    [FIX] merge.

Parents: f64176b7, 818c54d5
Showing 8 changed files with 187 additions and 19 deletions.
gargantext/constants.py                          +11   -2
gargantext/util/ngramsextractors.py               +5   -3
gargantext/util/taggers/TreeTagger.py             +3   -3
gargantext/util/taggers/TurboTagger.py           +11   -5
gargantext/util/toolchain/__init__.py             +2   -0
gargantext/util/toolchain/list_map.py             +8   -4
gargantext/util/toolchain/ngrams_extraction.py   +21   -2
gargantext/util/toolchain/parsing.py            +126   -0
gargantext/constants.py

@@ -172,13 +172,22 @@ DEFAULT_TFIDF_CUTOFF_RATIO = .45 # MAINLIST maximum terms in %
 DEFAULT_TFIDF_HARD_LIMIT  = 750   # MAINLIST maximum terms abs
                                   # (makes COOCS larger ~ O(N²) /!\)
-DEFAULT_COOC_THRESHOLD    = 5     # inclusive minimum for COOCS coefs
-
+DEFAULT_COOC_THRESHOLD    = 3     # inclusive minimum for COOCS coefs
+                                  # (makes COOCS more sparse)
 DEFAULT_MAPLIST_MAX       = 300   # MAPLIST maximum terms
 DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5 # part of monograms in MAPLIST
+                                     # (NB: used to be 0.005 !!)
+DEFAULT_MAX_NGRAM_LEN     = 7     # limit used after POStagging rule
+                                  # (initial ngrams number is a power law of this /!\)
+                                  # (and most longer ngrams have tiny freq anyway)
+DEFAULT_ALL_LOWERCASE_FLAG = True # lowercase ngrams before recording
+                                  # them to their DB table
+                                  # (potentially bad for acronyms but
+                                  # good for variants like same term
+                                  # occurring at sentence beginning)
 # ------------------------------------------------------------------------------
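For context, DEFAULT_COOC_THRESHOLD is an inclusive lower bound on cooccurrence weights. A minimal sketch, not from this commit (the dict layout is hypothetical), of how such a threshold sparsifies a cooccurrence matrix:

    # Hypothetical (ngram, ngram) -> weight entries.
    DEFAULT_COOC_THRESHOLD = 3

    coocs = {('climate', 'change'): 5,
             ('climate', 'policy'): 3,
             ('policy', 'change'): 2}

    # "inclusive minimum" means a pair sitting exactly at the threshold is kept
    coocs = {pair: w for pair, w in coocs.items() if w >= DEFAULT_COOC_THRESHOLD}

    print(coocs)  # {('climate', 'change'): 5, ('climate', 'policy'): 3}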
gargantext/util/ngramsextractors.py

 from gargantext.util.languages import languages
-from gargantext.constants import LANGUAGES
+from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN

 import nltk
 import re

@@ -17,7 +17,7 @@ class NgramsExtractor:
         """
         return re.sub(r'<[^>]{0,45}>', '', text)

-    def extract(self, text, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', label='NP'):
+    def extract(self, text, rule='{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}', label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
         text = self.clean_text(text)
         grammar = nltk.RegexpParser(label + ': ' + rule)
         tagged_tokens = list(self._tagger.tag_text(text))

@@ -25,7 +25,9 @@ class NgramsExtractor:
         grammar_parsed = grammar.parse(tagged_tokens)
         for subtree in grammar_parsed.subtrees():
             if subtree.label() == label:
-                yield subtree.leaves()
+                if len(subtree) < max_n_words:
+                    yield subtree.leaves()
+                    # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]

 class NgramsExtractors(dict):
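To see what the widened rule buys, here is a hedged, self-contained sketch using NLTK directly (the tagged sentence is hand-made; in the real class the tokens come from self._tagger.tag_text): the new (<P|IN> ...) branch lets a noun phrase absorb prepositional attachments such as "effects of climate change", while max_n_words caps chunk length.

    import nltk

    # The extended chunk rule from the diff; '<P|IN>' accepts preposition tags.
    rule = '{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}'
    grammar = nltk.RegexpParser('NP: ' + rule)

    # hand-tagged stand-in for self._tagger.tag_text(text)
    tagged = [('effects', 'NNS'), ('of', 'IN'),
              ('climate', 'NN'), ('change', 'NN')]

    max_n_words = 7  # cf. DEFAULT_MAX_NGRAM_LEN
    for subtree in grammar.parse(tagged).subtrees():
        if subtree.label() == 'NP' and len(subtree) < max_n_words:
            print(subtree.leaves())
    # -> [('effects', 'NNS'), ('of', 'IN'), ('climate', 'NN'), ('change', 'NN')]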
gargantext/util/taggers/TreeTagger.py

@@ -15,9 +15,9 @@ class identity_dict(dict):
 _tag_replacements = identity_dict({
     "NOM":  "NN",
     "NAM":  "NN",
-    "ADJ":  "NN",
-    "VER":  "JJ",
-    "PREP": "PRP",
+    "ADJ":  "JJ",
+    "VER":  "VB",
+    "PREP": "IN",
     "KON":  "CC",
     "DET":  "DT",
     "PRO":  "DT",
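The hunk header shows these mappings live inside an identity_dict, whose body is not in the diff. A plausible minimal reconstruction (the __missing__ override is an assumption) makes the intent of the fix visible: TreeTagger tags with a Penn Treebank equivalent get remapped, everything else passes through unchanged.

    # Assumed implementation of identity_dict (not shown in the diff):
    # a dict that returns the key itself for missing entries.
    class identity_dict(dict):
        def __missing__(self, key):
            return key

    _tag_replacements = identity_dict({
        "NOM": "NN", "NAM": "NN",
        "ADJ": "JJ", "VER": "VB", "PREP": "IN",  # the corrected mappings
        "KON": "CC", "DET": "DT", "PRO": "DT",
    })

    print(_tag_replacements["VER"])   # VB (the old table wrongly said JJ)
    print(_tag_replacements["SENT"])  # SENT: no replacement, identity fallback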
gargantext/util/taggers/TurboTagger.py

@@ -14,8 +14,14 @@ class TurboTagger:
     def tag_text(self, text):
         if not hasattr(self, '_nlpclient'):
             self._nlpclient = NLPClient()
-        tokens_tags = []
-        for sentence in self._nlpclient.tag(text):
-            for token, tag in sentence:
-                tokens_tags.append((token, tag, ))
-        return tokens_tags
+        try:
+            tokens_tags = []
+            for sentence in self._nlpclient.tag(text):
+                for token, tag in sentence:
+                    tokens_tags.append((token, tag, ))
+            return tokens_tags
+        except ConnectionRefusedError as e:
+            print(e)
+            print("TurboTagger: problem with the NLPServer (try running gargantext/parsing/Taggers/lib/nlpserver/server.py)")
+            # TODO abort workflow?
+            return []
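The pattern applied here is graceful degradation around a network round-trip: instead of letting ConnectionRefusedError kill the whole extraction workflow, the tagger logs and returns an empty list. A self-contained sketch of the same pattern with a hypothetical host, port, and wire format (NLPClient itself is not shown in the diff):

    import socket

    def tag_text(text, host='localhost', port=5555):
        # hypothetical server round-trip; degrade to [] if nothing is listening
        try:
            with socket.create_connection((host, port), timeout=2) as sock:
                sock.sendall(text.encode('utf-8'))
                raw = sock.recv(65536).decode('utf-8')
            # hypothetical wire format: one "token/TAG" pair per whitespace chunk
            return [tuple(pair.rsplit('/', 1)) for pair in raw.split()]
        except ConnectionRefusedError as e:
            print(e)
            print("tagger: is the NLP server running?")
            return []

    print(tag_text("wild pollinators"))  # [] when no server answers on port 5555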
gargantext/util/toolchain/__init__.py

@@ -105,6 +105,8 @@ def parse_extract_indexhyperdata(corpus):
                               grouplist_id = group_id)
+    print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
+
     print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t()))

 def t():
     return datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
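The t() helper simply stamps log lines. Run standalone it behaves as follows (the corpus and node ids are made up):

    from datetime import datetime

    def t():
        return datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    print('CORPUS #%d: [%s] new maplist node #%i' % (42, t(), 7))
    # e.g. CORPUS #42: [2016-03-24_17:03:51] new maplist node #7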
gargantext/util/toolchain/list_map.py

@@ -81,16 +81,20 @@ def do_maplist(corpus,
                      .limit(multigrams_limit)
                      .all()
                     )
-    print("MAPLIST: top_monograms =", len(top_monograms))
-    print("MAPLIST: top_multigrams = ", len(top_multigrams))
-
+    obtained_mono  = len(top_monograms)
+    obtained_multi = len(top_multigrams)
+    obtained_total = obtained_mono + obtained_multi
+    # print("MAPLIST: top_monograms =", obtained_mono)
+    # print("MAPLIST: top_multigrams = ", obtained_multi)
+    print("MAPLIST: kept %i ngrams in total " % obtained_total)

     # NEW MAPLIST NODE
     # -----------------
     # saving the parameters of the analysis in the Node JSON
     new_hyperdata = { 'corpus': corpus.id,
                       'limit' : limit,
-                      'monograms_part' : monograms_part
+                      'monograms_part' : monograms_part,
+                      'monograms_result' : obtained_mono/obtained_total
                     }
     if overwrite_id:
         # overwrite pre-existing node
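Isolated from the ORM query, the new bookkeeping reduces to a few lines. A sketch with hypothetical stand-in query results and ids, recording both the requested monogram share and the share actually obtained:

    top_monograms  = ['water', 'pollinator', 'yield']          # stand-in results
    top_multigrams = ['climate change', 'ecosystem services']  # stand-in results

    obtained_mono  = len(top_monograms)
    obtained_multi = len(top_multigrams)
    obtained_total = obtained_mono + obtained_multi
    print("MAPLIST: kept %i ngrams in total " % obtained_total)  # 5

    new_hyperdata = {'corpus': 123,          # hypothetical corpus.id
                     'limit': 300,           # cf. DEFAULT_MAPLIST_MAX
                     'monograms_part': .5,   # the requested monogram share
                     'monograms_result': obtained_mono / obtained_total}
    print(new_hyperdata['monograms_result'])  # 0.6 = share actually obtained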
gargantext/util/toolchain/ngrams_extraction.py

@@ -4,6 +4,7 @@ from gargantext.constants import *
 from gargantext.util.ngramsextractors import ngramsextractors

 from collections import defaultdict
+from re import sub
 from gargantext.util.scheduling import scheduled

@@ -32,7 +33,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
     db.commit()

-def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )):
+def extract_ngrams(corpus, keys=('title', 'abstract', )):
     """Extract ngrams for every document below the given corpus.
     Default language is given by the resource type.
     The result is then inserted into database.

@@ -62,7 +63,7 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )):
                         # get ngrams
                         for ngram in ngramsextractor.extract(value):
                             tokens = tuple(token[0] for token in ngram)
-                            terms = ' '.join(tokens)
+                            terms = normalize_terms(' '.join(tokens))
                             nodes_ngrams_count[(document.id, terms)] += 1
                             ngrams_data.add((terms[:255], len(tokens), ))
     # integrate ngrams and nodes-ngrams

@@ -84,3 +85,21 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstract', )):
         corpus.save_hyperdata()
         session.commit()
         raise error
+
+
+def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
+    """
+    Removes unwanted trailing punctuation
+    AND optionally puts everything to lowercase
+
+    ex /'ecosystem services'/ => /ecosystem services/
+
+    (benefits from normalize_chars upstream so there's less cases to consider)
+    """
+    term_str = sub(r'^[-",;/%(){}\\\[\]\.\']+', '', term_str)
+    term_str = sub(r'[-",;/%(){}\\\[\]\.\']+$', '', term_str)
+    if do_lowercase:
+        term_str = term_str.lower()
+    return term_str
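Extracted from the diff and run standalone, the helper confirms the docstring's example; the second call below is an extra illustration of the acronym caveat noted next to DEFAULT_ALL_LOWERCASE_FLAG in constants.py:

    from re import sub

    def normalize_terms(term_str, do_lowercase=True):  # default mirrors DEFAULT_ALL_LOWERCASE_FLAG
        term_str = sub(r'^[-",;/%(){}\\\[\]\.\']+', '', term_str)
        term_str = sub(r'[-",;/%(){}\\\[\]\.\']+$', '', term_str)
        if do_lowercase:
            term_str = term_str.lower()
        return term_str

    print(normalize_terms("'Ecosystem services'"))                 # ecosystem services
    print(normalize_terms("DNA sequencing,", do_lowercase=False))  # DNA sequencing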
gargantext/util/toolchain/parsing.py

@@ -3,6 +3,7 @@ from gargantext.models import *
 from gargantext.constants import *
 from collections import defaultdict
+from re import sub

 def parse(corpus):
     try:
@@ -21,14 +22,26 @@ def parse(corpus):
             resource_path = resource['path']
             # extract and insert documents from corpus resource into database
             for hyperdata in resource_parser(resource_path):
+                # uniformize the text values for easier POStagging and processing
+                for k in ['abstract', 'title']:
+                    if k in hyperdata:
+                        hyperdata[k] = normalize_chars(hyperdata[k])
+
                 # save as DB child
                 # ----------------
                 document = corpus.add_child(
                     typename = 'DOCUMENT',
                     name = hyperdata.get('title', '')[:255],
                     hyperdata = hyperdata,
                 )
                 session.add(document)
+
+                # a simple census to raise language info at corpus level
+                if "language_iso2" in hyperdata:
+                    observed_languages[hyperdata["language_iso2"]] += 1
+
                 # logging
                 if documents_count % BATCH_PARSING_SIZE == 0:
                     corpus.status('parsing', progress=documents_count)
                     corpus.save_hyperdata()
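The language census relies on observed_languages being a counting dict initialized earlier in parse(); its initialization is outside this hunk, so defaultdict(int) is an assumption consistent with the += 1 usage. A self-contained sketch:

    from collections import defaultdict

    observed_languages = defaultdict(int)  # assumed initialization

    # stand-in for the hyperdata dicts yielded by resource_parser
    for hyperdata in ({'language_iso2': 'en'}, {'language_iso2': 'fr'},
                      {'title': 'no language field'}, {'language_iso2': 'en'}):
        if "language_iso2" in hyperdata:
            observed_languages[hyperdata["language_iso2"]] += 1

    print(dict(observed_languages))  # {'en': 2, 'fr': 1}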
@@ -47,3 +60,116 @@ def parse(corpus):
         corpus.save_hyperdata()
         session.commit()
         raise error
+
+
+def normalize_chars(my_str):
+    """
+    Simplification of the character strings fed into the DB
+    ("the characters we would rather never see")
+      - normalization
+         > spaces
+         > hyphens
+         > quotation marks
+      - removal of ligatures
+
+    NB: this input normalization does not remove punctuation marks
+        but transcodes them to a more canonical form to reduce their diversity
+        (more invasive treatments, such as removing quotes or lowercasing,
+         belong *after* the tagger,
+         cf. toolchain.ngrams_extraction.normalize_terms)
+    """
+    # --------------
+    # S P A C E S
+    # --------------
+    # all control characters (incl. \t = \x{0009}, \n = \x{000A} and \r = \x{000D}) --> space
+    my_str = sub(r'[\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F\u007F]', ' ', my_str)
+
+    # line separator and paragraph separator --> space
+    my_str = sub(r'\u2028', ' ', my_str)
+    my_str = sub(r'\u2029', ' ', my_str)
+
+    # U+0092: sometimes a quote, sometimes a control character
+    my_str = sub(r'\u0092', ' ', my_str)
+
+    # all alternative spaces --> space
+    my_str = sub(r'[\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u3000\uFEFF]', ' ', my_str)
+
+    # finally, collapse whitespace runs and strip leading/trailing spaces
+    my_str = sub(r'\s+', ' ', my_str)
+    my_str = sub(r'^\s', '', my_str)
+    my_str = sub(r'\s$', '', my_str)
+
+    # ------------------------
+    # P U N C T U A T I O N
+    # ------------------------
+    # most alternative hyphens --> regular hyphen-minus
+    # (in order: U+2010 U+2011 U+2012 U+2013 U+2014 U+2015 U+2212 U+FE63)
+    my_str = sub(r'[‐‑‒–—―−﹣]', '-', my_str)
+    # the macron is also sometimes used as a hyphen
+    my_str = sub(r'\u00af', '-', my_str)
+
+    # Quotation marks
+    # ---------------
+    # most single quotes --> ' APOSTROPHE
+    my_str = sub(r"[‘’‚`‛]", "'", my_str)   # U+2018 U+2019 U+201A U+201B
+    my_str = sub(r'‹ ?', "'", my_str)       # U+2039 plus optional space after
+    my_str = sub(r' ?›', "'", my_str)       # U+203A plus optional space before
+    # most double quotes --> " QUOTATION MARK
+    my_str = sub(r'[“”„‟]', '"', my_str)    # U+201C U+201D U+201E U+201F
+    my_str = sub(r'« ?', '"', my_str)       # U+00AB plus optional space after
+    my_str = sub(r' ?»', '"', my_str)       # U+00BB plus optional space before
+    # two single quotes (prepared above) => one double quote
+    my_str = sub(r"''", '"', my_str)
+
+    # Others
+    # ------
+    my_str = sub(r'…', '...', my_str)
+    # U+0085 (NEL) is sometimes used like '...'
+    my_str = sub(r'\u0085', '...', my_str)
+    my_str = sub(r'€', 'EUR', my_str)
+    # a few common bullet characters
+    my_str = sub(r'▪', '*', my_str)
+    my_str = sub(r'►', '*', my_str)
+    my_str = sub(r'●', '*', my_str)
+    my_str = sub(r'◘', '*', my_str)
+    my_str = sub(r'→', '*', my_str)
+    my_str = sub(r'•', '*', my_str)
+    my_str = sub(r'·', '*', my_str)
+
+    # ------------------
+    # L I G A T U R E S
+    # ------------------
+    my_str = sub(r'Ꜳ', 'AA', my_str)
+    my_str = sub(r'ꜳ', 'aa', my_str)
+    my_str = sub(r'Æ', 'AE', my_str)
+    my_str = sub(r'æ', 'ae', my_str)
+    my_str = sub(r'Ǳ', 'DZ', my_str)
+    my_str = sub(r'ǲ', 'Dz', my_str)
+    my_str = sub(r'ǳ', 'dz', my_str)
+    my_str = sub(r'ﬃ', 'ffi', my_str)
+    my_str = sub(r'ﬀ', 'ff', my_str)
+    my_str = sub(r'ﬁ', 'fi', my_str)
+    my_str = sub(r'ﬄ', 'ffl', my_str)
+    my_str = sub(r'ﬂ', 'fl', my_str)
+    my_str = sub(r'ﬅ', 'ft', my_str)
+    my_str = sub(r'Ĳ', 'IJ', my_str)
+    my_str = sub(r'ĳ', 'ij', my_str)
+    my_str = sub(r'Ǉ', 'LJ', my_str)
+    my_str = sub(r'ǉ', 'lj', my_str)
+    my_str = sub(r'Ǌ', 'NJ', my_str)
+    my_str = sub(r'ǌ', 'nj', my_str)
+    my_str = sub(r'Œ', 'OE', my_str)
+    my_str = sub(r'œ', 'oe', my_str)
+    my_str = sub(r'\u009C', 'oe', my_str)   # U+009C (control char rendered as oe)
+    my_str = sub(r'ﬆ', 'st', my_str)
+    my_str = sub(r'Ꜩ', 'Tz', my_str)
+    my_str = sub(r'ꜩ', 'tz', my_str)
+
+    return my_str
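A quick spot check of a few of the substitutions above (dash variants, curly double quotes, and the Œ ligature) on a messy input:

    from re import sub

    s = '“Œdème” — chronic'
    s = sub(r'[‐‑‒–—―−﹣]', '-', s)   # alternative hyphens --> '-'
    s = sub(r'[“”„‟]', '"', s)        # curly double quotes --> '"'
    s = sub(r'Œ', 'OE', s)            # ligature
    print(s)  # "OEdème" - chronic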