Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
ed967608
Commit
ed967608
authored
Jul 28, 2016
by
c24b
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
LANG DETECTION IN PARSING
parent
cb34ae15
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
56 additions
and
33 deletions
+56
-33
constants.py
gargantext/constants.py
+3
-1
requirements.pip
gargantext/requirements.pip
+1
-0
languages.py
gargantext/util/languages.py
+9
-2
parsing.py
gargantext/util/toolchain/parsing.py
+43
-30
debug
install/gargamelle/debug
+0
-0
No files found.
gargantext/constants.py
View file @
ed967608
...
@@ -315,7 +315,9 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
...
@@ -315,7 +315,9 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# "cool example".
# "cool example".
# (all 1 to n-1 length ngrams,
# (all 1 to n-1 length ngrams,
# at indexing after extraction)
# at indexing after extraction)
DEFAULT_INDEX_FIELDS
=
(
'title'
,
'abstract'
,
)
#Defaults Fields for ngrams extraction
# Defaults INDEXED Fields for ngrams extraction
# put longest field first in order to make detection language more efficient
DEFAULT_INDEX_FIELDS
=
(
'abstract'
,
'title'
)
# Grammar rules for chunking
# Grammar rules for chunking
RULE_JJNN
=
"{<JJ.*>*<NN.*|>+<JJ.*>*}"
RULE_JJNN
=
"{<JJ.*>*<NN.*|>+<JJ.*>*}"
RULE_JJDTNN
=
"{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
RULE_JJDTNN
=
"{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
...
...
gargantext/requirements.pip
View file @
ed967608
...
@@ -14,6 +14,7 @@ djangorestframework==3.3.2
...
@@ -14,6 +14,7 @@ djangorestframework==3.3.2
html5lib==0.9999999
html5lib==0.9999999
jdatetime==1.7.2
jdatetime==1.7.2
kombu==3.0.33
kombu==3.0.33
langdetect==1.0.6
lxml==3.5.0
lxml==3.5.0
networkx==1.11
networkx==1.11
nltk==3.1
nltk==3.1
...
...
gargantext/util/languages.py
View file @
ed967608
from
gargantext.constants
import
*
from
gargantext.constants
import
*
from
langdetect
import
detect
from
langdetect
import
DetectorFactory
class
Language
:
class
Language
:
def
__init__
(
self
,
iso2
=
None
,
iso3
=
None
,
name
=
None
):
def
__init__
(
self
,
iso2
=
None
,
iso3
=
None
,
full_name
=
None
,
name
=
None
):
self
.
iso2
=
iso2
self
.
iso2
=
iso2
self
.
iso3
=
iso3
self
.
iso3
=
iso3
self
.
name
=
name
self
.
name
=
name
self
.
implemented
=
iso2
in
LANGUAGES
self
.
implemented
=
iso2
in
LANGUAGES
def
__str__
(
self
):
def
__str__
(
self
):
result
=
'<Language'
result
=
'<Language'
for
key
,
value
in
self
.
__dict__
.
items
():
for
key
,
value
in
self
.
__dict__
.
items
():
...
@@ -16,6 +18,10 @@ class Language:
...
@@ -16,6 +18,10 @@ class Language:
return
result
return
result
__repr__
=
__str__
__repr__
=
__str__
def
detect_lang
(
self
,
text
):
DetectorFactory
.
seed
=
0
return
Languages
[
detect
(
text
)]
.
iso2
class
Languages
(
dict
):
class
Languages
(
dict
):
def
__missing__
(
self
,
key
):
def
__missing__
(
self
,
key
):
key
=
key
.
lower
()
key
=
key
.
lower
()
...
@@ -49,3 +55,4 @@ languages['fre'] = languages['fr']
...
@@ -49,3 +55,4 @@ languages['fre'] = languages['fr']
languages
[
'ger'
]
=
languages
[
'de'
]
languages
[
'ger'
]
=
languages
[
'de'
]
languages
[
'Français'
]
=
languages
[
'fr'
]
languages
[
'Français'
]
=
languages
[
'fr'
]
languages
[
'en_US'
]
=
languages
[
'en'
]
languages
[
'en_US'
]
=
languages
[
'en'
]
languages
[
'english'
]
=
languages
[
'en'
]
gargantext/util/toolchain/parsing.py
View file @
ed967608
...
@@ -4,6 +4,7 @@ from gargantext.constants import *
...
@@ -4,6 +4,7 @@ from gargantext.constants import *
#from gargantext.util.parsers import *
#from gargantext.util.parsers import *
from
collections
import
defaultdict
,
Counter
from
collections
import
defaultdict
,
Counter
from
re
import
sub
from
re
import
sub
from
gargantext.util.languages
import
languages
,
detect_lang
def
parse
(
corpus
):
def
parse
(
corpus
):
try
:
try
:
...
@@ -27,8 +28,8 @@ def parse(corpus):
...
@@ -27,8 +28,8 @@ def parse(corpus):
else
:
else
:
#observed langages in corpus docs
#observed langages in corpus docs
corpus
.
languages
=
defaultdict
.
fromkeys
(
source
[
"default_languages"
],
0
)
corpus
.
languages
=
defaultdict
.
fromkeys
(
source
[
"default_languages"
],
0
)
skipped_languages
=
[]
#remember the skipped docs in parsing
#remember the skipped docs in parsing
skipped_languages
=
[]
corpus
.
skipped_docs
=
[]
corpus
.
skipped_docs
=
[]
session
.
add
(
corpus
)
session
.
add
(
corpus
)
session
.
commit
()
session
.
commit
()
...
@@ -43,34 +44,50 @@ def parse(corpus):
...
@@ -43,34 +44,50 @@ def parse(corpus):
hyperdata
[
k
]
=
normalize_chars
(
hyperdata
[
k
])
hyperdata
[
k
]
=
normalize_chars
(
hyperdata
[
k
])
except
Exception
as
error
:
except
Exception
as
error
:
hyperdata
[
"error"
]
=
"Error normalize_chars"
hyperdata
[
"error"
]
=
"Error normalize_chars"
indexed
=
False
# a simple census to raise language info at corpus level
# a simple census to raise language info at corpus level
if
"language_iso2"
in
hyperdata
.
keys
():
for
l
in
[
"iso2"
,
"iso3"
,
"full_name"
]:
if
hyperdata
[
"indexed"
]
is
True
:
break
lang_field
=
"language_"
+
l
if
lang_field
in
hyperdata
.
keys
():
if
l
==
"iso2"
:
try
:
try
:
corpus
.
languages
[
hyperdata
[
"language_iso2"
]]
+=
1
corpus
.
languages
[
hyperdata
[
"language_iso2"
]]
+=
1
indexed
=
True
except
KeyError
:
except
KeyError
:
print
(
"KeyError"
,
hyperdata
[
"language_iso2"
])
hyperdata
[
"error"
]
=
"Error: unsupported language"
hyperdata
[
"error"
]
=
"Error: unsupported language"
skipped_languages
.
append
(
hyperdata
[
"language_iso2"
])
skipped_languages
.
append
(
hyperdata
[
"language_iso2"
])
elif
"language_fullname"
in
hyperdata
.
keys
():
else
:
try
:
try
:
#full => iso2
lang
=
languages
(
hyperdata
[
lang_field
]
.
lower
())
.
iso2
lang
=
languages
[
hyperdata
[
"language_fullname"
]]
.
name
.
lower
()
corpus
.
languages
[
lang
]
+=
1
corpus
.
languages
[
lang
]
+=
1
indexed
=
True
except
KeyError
:
except
KeyError
:
print
(
"KeyError"
,
hyperdata
[
"language_fullname"
])
hyperdata
[
"error"
]
=
"Error: unsupported language"
hyperdata
[
"error"
]
=
"Error: unsupported language"
skipped_languages
.
append
(
lang
)
skipped_languages
.
append
(
lang
)
else
:
if
indexed
is
False
:
pass
#no language have been indexed
#no language have been indexed
#detectlang by index_fields
#detectlang by index_fields
# for k in DEFAULT_INDEX_FIELDS:
for
k
in
DEFAULT_INDEX_FIELDS
:
# if k in hyperdata.keys():
if
indexed
is
True
:
# try:
break
# hyperdata["language_iso2"] = langdetect(hyperdata[k])
if
k
in
hyperdata
.
keys
():
# except Exception as error :
try
:
# pass
hyperdata
[
"language_iso2"
]
=
detect_lang
(
hyperdata
[
k
])
#print(hyperdata.keys())
corpus
.
languages
[
lang
]
+=
1
indexed
=
True
break
except
KeyError
:
hyperdata
[
"error"
]
=
"Error: unsupported language"
skipped_languages
.
append
(
hyperdata
[
"language_iso2"
])
indexed
=
True
except
Exception
as
error
:
print
(
error
)
pass
# save as DB child
# save as DB child
# ----------------
# ----------------
document
=
corpus
.
add_child
(
document
=
corpus
.
add_child
(
...
@@ -96,10 +113,6 @@ def parse(corpus):
...
@@ -96,10 +113,6 @@ def parse(corpus):
session
.
add
(
corpus
)
session
.
add
(
corpus
)
session
.
commit
()
session
.
commit
()
# update info about the resource
# update info about the resource
resource
[
'extracted'
]
=
True
resource
[
'extracted'
]
=
True
# add a corpus-level info about languages adding a __skipped__ info
# add a corpus-level info about languages adding a __skipped__ info
...
...
install/gargamelle/debug
100644 → 100755
View file @
ed967608
File mode changed from 100644 to 100755
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment