Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
67cd43b0
Commit
67cd43b0
authored
Jul 28, 2016
by
c24b
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
LANG DETECTION IN PARSING
parent
b192ddd8
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
56 additions
and
33 deletions
+56
-33
constants.py
gargantext/constants.py
+3
-1
requirements.pip
gargantext/requirements.pip
+1
-0
languages.py
gargantext/util/languages.py
+9
-2
parsing.py
gargantext/util/toolchain/parsing.py
+43
-30
debug
install/gargamelle/debug
+0
-0
No files found.
gargantext/constants.py
View file @
67cd43b0
...
...
@@ -315,7 +315,9 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# "cool example".
# (all 1 to n-1 length ngrams,
# at indexing after extraction)
DEFAULT_INDEX_FIELDS
=
(
'title'
,
'abstract'
,
)
#Defaults Fields for ngrams extraction
# Default INDEXED fields for ngrams extraction
# put the longest field first in order to make language detection more efficient
DEFAULT_INDEX_FIELDS
=
(
'abstract'
,
'title'
)
# Grammar rules for chunking
RULE_JJNN
=
"{<JJ.*>*<NN.*|>+<JJ.*>*}"
RULE_JJDTNN
=
"{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
...
...
gargantext/requirements.pip
View file @
67cd43b0
...
...
@@ -14,6 +14,7 @@ djangorestframework==3.3.2
html5lib==0.9999999
jdatetime==1.7.2
kombu==3.0.33
langdetect==1.0.6
lxml==3.5.0
networkx==1.11
nltk==3.1
...
...
gargantext/util/languages.py
View file @
67cd43b0
from
gargantext.constants
import
*
from
langdetect
import
detect
from
langdetect
import
DetectorFactory
class
Language
:
def
__init__
(
self
,
iso2
=
None
,
iso3
=
None
,
name
=
None
):
def
__init__
(
self
,
iso2
=
None
,
iso3
=
None
,
full_name
=
None
,
name
=
None
):
self
.
iso2
=
iso2
self
.
iso3
=
iso3
self
.
name
=
name
self
.
implemented
=
iso2
in
LANGUAGES
def
__str__
(
self
):
result
=
'<Language'
for
key
,
value
in
self
.
__dict__
.
items
():
...
...
@@ -16,6 +18,10 @@ class Language:
return
result
__repr__
=
__str__
def
detect_lang
(
self
,
text
):
DetectorFactory
.
seed
=
0
return
Languages
[
detect
(
text
)]
.
iso2
class
Languages
(
dict
):
def
__missing__
(
self
,
key
):
key
=
key
.
lower
()
...
...
@@ -49,3 +55,4 @@ languages['fre'] = languages['fr']
languages
[
'ger'
]
=
languages
[
'de'
]
languages
[
'Français'
]
=
languages
[
'fr'
]
languages
[
'en_US'
]
=
languages
[
'en'
]
languages
[
'english'
]
=
languages
[
'en'
]
gargantext/util/toolchain/parsing.py
View file @
67cd43b0
...
...
@@ -4,6 +4,7 @@ from gargantext.constants import *
#from gargantext.util.parsers import *
from
collections
import
defaultdict
,
Counter
from
re
import
sub
from
gargantext.util.languages
import
languages
,
detect_lang
def
parse
(
corpus
):
try
:
...
...
@@ -27,8 +28,8 @@ def parse(corpus):
else
:
#observed languages in corpus docs
corpus
.
languages
=
defaultdict
.
fromkeys
(
source
[
"default_languages"
],
0
)
skipped_languages
=
[]
#remember the skipped docs in parsing
skipped_languages
=
[]
corpus
.
skipped_docs
=
[]
session
.
add
(
corpus
)
session
.
commit
()
...
...
@@ -43,34 +44,50 @@ def parse(corpus):
hyperdata
[
k
]
=
normalize_chars
(
hyperdata
[
k
])
except
Exception
as
error
:
hyperdata
[
"error"
]
=
"Error normalize_chars"
indexed
=
False
# a simple census to raise language info at corpus level
if
"language_iso2"
in
hyperdata
.
keys
():
try
:
corpus
.
languages
[
hyperdata
[
"language_iso2"
]]
+=
1
except
KeyError
:
print
(
"KeyError"
,
hyperdata
[
"language_iso2"
])
hyperdata
[
"error"
]
=
"Error: unsupported language"
skipped_languages
.
append
(
hyperdata
[
"language_iso2"
])
elif
"language_fullname"
in
hyperdata
.
keys
():
try
:
#full => iso2
lang
=
languages
[
hyperdata
[
"language_fullname"
]]
.
name
.
lower
()
corpus
.
languages
[
lang
]
+=
1
except
KeyError
:
print
(
"KeyError"
,
hyperdata
[
"language_fullname"
])
hyperdata
[
"error"
]
=
"Error: unsupported language"
skipped_languages
.
append
(
lang
)
else
:
pass
for
l
in
[
"iso2"
,
"iso3"
,
"full_name"
]:
if
hyperdata
[
"indexed"
]
is
True
:
break
lang_field
=
"language_"
+
l
if
lang_field
in
hyperdata
.
keys
():
if
l
==
"iso2"
:
try
:
corpus
.
languages
[
hyperdata
[
"language_iso2"
]]
+=
1
indexed
=
True
except
KeyError
:
hyperdata
[
"error"
]
=
"Error: unsupported language"
skipped_languages
.
append
(
hyperdata
[
"language_iso2"
])
else
:
try
:
lang
=
languages
(
hyperdata
[
lang_field
]
.
lower
())
.
iso2
corpus
.
languages
[
lang
]
+=
1
indexed
=
True
except
KeyError
:
hyperdata
[
"error"
]
=
"Error: unsupported language"
skipped_languages
.
append
(
lang
)
if
indexed
is
False
:
#no language has been indexed
#detectlang by index_fields
# for k in DEFAULT_INDEX_FIELDS:
# if k in hyperdata.keys():
# try:
# hyperdata["language_iso2"] = langdetect(hyperdata[k])
# except Exception as error :
# pass
#print(hyperdata.keys())
for
k
in
DEFAULT_INDEX_FIELDS
:
if
indexed
is
True
:
break
if
k
in
hyperdata
.
keys
():
try
:
hyperdata
[
"language_iso2"
]
=
detect_lang
(
hyperdata
[
k
])
corpus
.
languages
[
lang
]
+=
1
indexed
=
True
break
except
KeyError
:
hyperdata
[
"error"
]
=
"Error: unsupported language"
skipped_languages
.
append
(
hyperdata
[
"language_iso2"
])
indexed
=
True
except
Exception
as
error
:
print
(
error
)
pass
# save as DB child
# ----------------
document
=
corpus
.
add_child
(
...
...
@@ -96,10 +113,6 @@ def parse(corpus):
session
.
add
(
corpus
)
session
.
commit
()
# update info about the resource
resource
[
'extracted'
]
=
True
# add a corpus-level info about languages adding a __skipped__ info
...
...
install/gargamelle/debug
100644 → 100755
View file @
67cd43b0
File mode changed from 100644 to 100755
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment