Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
37d2686d
Commit
37d2686d
authored
Jan 14, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FIX] add string harmonization as pretreatment for POStaggers (TODO better tokenization
parent
daf4e9d3
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
128 additions
and
2 deletions
+128
-2
corpustools.py
parsing/corpustools.py
+128
-2
No files found.
parsing/corpustools.py
View file @
37d2686d
...
...
@@ -11,6 +11,8 @@ from gargantext_web.db import *
from
.parsers_config
import
parsers
as
_parsers
from
ngram.tools
import
insert_ngrams
from
re
import
sub
# keep all the parsers in a cache
class
Parsers
(
defaultdict
):
def
__init__
(
self
):
...
...
@@ -239,10 +241,20 @@ def extract_ngrams(corpus, keys, nlp=True):
ngramsextractor
=
ngramsextractors
[
language_iso2
]
for
text
in
nodeinfo
[
2
:]:
if
text
is
not
None
and
len
(
text
):
# normalisation caras (harmonisation guillemets etc)
clean_text
=
text_prepa
(
text
)
.
lower
()
# TOTEST sent_tokenize (with nltk models) => [sentences]
# then: for sentence => ngramsextract => ngrams.append()
# TOTEST " ".join(wordpunct_tokenize(text)) pour éviter:
# - "vvsp+vvmi/vvmf"
# - "-stimulated increase"
if
nlp
==
True
:
ngrams
=
ngramsextractor
.
extract_ngrams
(
text
.
replace
(
"["
,
""
)
.
replace
(
"]"
,
""
)
)
ngrams
=
ngramsextractor
.
extract_ngrams
(
clean_text
)
else
:
ngrams
=
wordpunct_tokenize
(
text
.
lower
()
)
ngrams
=
wordpunct_tokenize
(
clean_text
)
for
ngram
in
ngrams
:
if
nlp
==
True
:
...
...
@@ -286,3 +298,117 @@ def extract_ngrams(corpus, keys, nlp=True):
# commit to database
db
.
commit
()
def
text_prepa
(
my_str
):
"""
Simplification des chaînes de caractères pour le tagging
- suppression/normalisation espaces et ponctuations
- jonction césures,
- déligatures
- tout en min
(c) rloth 2010 - 2016
"""
# --------------
# E S P A C E S
# --------------
# tous les caractères de contrôle (dont \t = \x{0009}, \n = \x{000A} et \r = \x{000D}) --> espace
my_str
=
sub
(
r'[\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F\u007F]'
,
' '
,
my_str
)
# Line separator
my_str
=
sub
(
r'\u2028'
,
' '
,
my_str
)
my_str
=
sub
(
r'\u2029'
,
' '
,
my_str
)
# U+0092: parfois quote parfois cara de contrôle
my_str
=
sub
(
r'\u0092'
,
' '
,
my_str
)
# tous les espaces alternatifs --> espace
my_str
=
sub
(
r'[\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u3000\uFEFF]'
,
' '
,
my_str
)
# pour finir on enlève les espaces en trop
# (dits "trailing spaces")
my_str
=
sub
(
r'\s+'
,
' '
,
my_str
)
my_str
=
sub
(
r'^\s'
,
''
,
my_str
)
my_str
=
sub
(
r'\s$'
,
''
,
my_str
)
# ------------------------
# P O N C T U A T I O N S
# ------------------------
# la plupart des tirets alternatifs --> tiret normal (dit "du 6")
# (dans l'ordre U+002D U+2010 U+2011 U+2012 U+2013 U+2014 U+2015 U+2212 U+FE63)
my_str
=
sub
(
r'[‐‑‒–—―−﹣]'
,
'-'
,
my_str
)
# le macron aussi parfois comme tiret
# (mais compatibilité avec desaccent ?)
my_str
=
sub
(
r'\u00af'
,
'-'
,
my_str
)
# Guillemets
# ----------
# la plupart des quotes simples --> ' APOSTROPHE
my_str
=
sub
(
r"‘’‚`‛"
,
"'"
,
my_str
)
# U+2018 U+2019 U+201a U+201b
my_str
=
sub
(
r'‹ ?'
,
"'"
,
my_str
)
# U+2039 plus espace éventuel après
my_str
=
sub
(
r' ?›'
,
"'"
,
my_str
)
# U+203A plus espace éventuel avant
# la plupart des quotes doubles --> " QUOTATION MARK
my_str
=
sub
(
r'“”„‟'
,
'"'
,
my_str
)
# U+201C U+201D U+201E U+201F
my_str
=
sub
(
r'« ?'
,
'"'
,
my_str
)
# U+20AB plus espace éventuel après
my_str
=
sub
(
r' ?»'
,
'"'
,
my_str
)
# U+20AB plus espace éventuel avant
# deux quotes simples (préparées ci-dessus) => une double
my_str
=
sub
(
r"''"
,
'"'
,
my_str
)
# Autres
# -------
my_str
=
sub
(
r'…'
,
'...'
,
my_str
)
# paragraph separator utilisé parfois comme '...'
my_str
=
sub
(
r'\u0085'
,
'...'
,
my_str
)
my_str
=
sub
(
r'€'
,
'EUR'
,
my_str
)
# quelques puces courantes (bullets)
my_str
=
sub
(
r'▪'
,
'*'
,
my_str
)
my_str
=
sub
(
r'►'
,
'*'
,
my_str
)
my_str
=
sub
(
r'●'
,
'*'
,
my_str
)
my_str
=
sub
(
r'◘'
,
'*'
,
my_str
)
my_str
=
sub
(
r'→'
,
'*'
,
my_str
)
my_str
=
sub
(
r'•'
,
'*'
,
my_str
)
my_str
=
sub
(
r'·'
,
'*'
,
my_str
)
my_str
=
sub
(
r'☽'
,
'*'
,
my_str
)
# --------------
# C E S U R E S
# --------------
# NB: pré-suppose déjà: tr '\n' ' ' et normalisation des tirets
my_str
=
sub
(
r'(?<=\w)- '
,
'-'
,
my_str
)
# version light avec tiret préservé
# ------------------
# L I G A T U R E S
# ------------------
my_str
=
sub
(
r'Ꜳ'
,
'AA'
,
my_str
)
my_str
=
sub
(
r'ꜳ'
,
'aa'
,
my_str
)
my_str
=
sub
(
r'Æ'
,
'AE'
,
my_str
)
my_str
=
sub
(
r'æ'
,
'ae'
,
my_str
)
my_str
=
sub
(
r'DZ'
,
'DZ'
,
my_str
)
my_str
=
sub
(
r'Dz'
,
'Dz'
,
my_str
)
my_str
=
sub
(
r'dz'
,
'dz'
,
my_str
)
my_str
=
sub
(
r'ffi'
,
'ffi'
,
my_str
)
my_str
=
sub
(
r'ff'
,
'ff'
,
my_str
)
my_str
=
sub
(
r'fi'
,
'fi'
,
my_str
)
my_str
=
sub
(
r'ffl'
,
'ffl'
,
my_str
)
my_str
=
sub
(
r'fl'
,
'fl'
,
my_str
)
my_str
=
sub
(
r'ſt'
,
'ft'
,
my_str
)
my_str
=
sub
(
r'IJ'
,
'IJ'
,
my_str
)
my_str
=
sub
(
r'ij'
,
'ij'
,
my_str
)
my_str
=
sub
(
r'LJ'
,
'LJ'
,
my_str
)
my_str
=
sub
(
r'lj'
,
'lj'
,
my_str
)
my_str
=
sub
(
r'NJ'
,
'NJ'
,
my_str
)
my_str
=
sub
(
r'nj'
,
'nj'
,
my_str
)
my_str
=
sub
(
r'Œ'
,
'OE'
,
my_str
)
my_str
=
sub
(
r'œ'
,
'oe'
,
my_str
)
my_str
=
sub
(
r'\u009C'
,
'oe'
,
my_str
)
# U+009C (cara contrôle vu comme oe)
my_str
=
sub
(
r'st'
,
'st'
,
my_str
)
my_str
=
sub
(
r'Ꜩ'
,
'Tz'
,
my_str
)
my_str
=
sub
(
r'ꜩ'
,
'tz'
,
my_str
)
return
my_str
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment