Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
8d0e31fa
Commit
8d0e31fa
authored
Aug 26, 2016
by
c24b
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
LANG undeclared DETECTED at parsing => hyperdata
parent
f1476df9
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
22 additions
and
32 deletions
+22
-32
languages.py
gargantext/util/languages.py
+3
-14
parsing.py
gargantext/util/toolchain/parsing.py
+19
-18
No files found.
gargantext/util/languages.py
View file @
8d0e31fa
from
gargantext.constants
import
*
from
gargantext.constants
import
*
from
langdetect
import
detect
,
DetectorFactory
from
langdetect
import
detect
,
DetectorFactory
import
time
def
timing
(
f
):
def
wrap
(
*
args
):
time1
=
time
.
time
()
ret
=
f
(
*
args
)
time2
=
time
.
time
()
print
(
'function took
%0.3
f ms'
%
((
time2
-
time1
)
*
1000.0
))
return
ret
return
wrap
class
Language
:
class
Language
:
def
__init__
(
self
,
iso2
=
None
,
iso3
=
None
,
full_name
=
None
,
name
=
None
):
def
__init__
(
self
,
iso2
=
None
,
iso3
=
None
,
full_name
=
None
,
name
=
None
):
self
.
iso2
=
iso2
self
.
iso2
=
iso2
self
.
iso3
=
iso3
self
.
iso3
=
iso3
self
.
name
=
name
self
.
name
=
name
self
.
full_name
=
full_name
self
.
implemented
=
iso2
in
LANGUAGES
self
.
implemented
=
iso2
in
LANGUAGES
def
__str__
(
self
):
def
__str__
(
self
):
...
@@ -38,10 +27,10 @@ class Languages(dict):
...
@@ -38,10 +27,10 @@ class Languages(dict):
languages
=
Languages
()
languages
=
Languages
()
@
timing
def
detect_lang
(
text
):
def
detect_lang
(
text
):
DetectorFactory
.
seed
=
0
DetectorFactory
.
seed
=
0
return
languages
[
detect
(
text
)]
.
iso2
return
languages
[
detect
(
text
)]
import
pycountry
import
pycountry
pycountry_keys
=
(
pycountry_keys
=
(
...
...
gargantext/util/toolchain/parsing.py
View file @
8d0e31fa
...
@@ -21,7 +21,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
...
@@ -21,7 +21,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
skipped_languages
.
append
(
hyperdata
[
"language_iso2"
])
skipped_languages
.
append
(
hyperdata
[
"language_iso2"
])
return
observed_languages
,
skipped_languages
return
observed_languages
,
skipped_languages
observed_languages
.
append
(
hyperdata
[
"language_iso2"
])
observed_languages
.
append
(
hyperdata
[
"language_iso2"
])
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
elif
"language_iso3"
in
hyperdata
.
keys
():
elif
"language_iso3"
in
hyperdata
.
keys
():
...
@@ -32,33 +32,33 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
...
@@ -32,33 +32,33 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
skipped_languages
.
append
(
lang
)
skipped_languages
.
append
(
lang
)
return
observed_languages
,
skipped_languages
return
observed_languages
,
skipped_languages
observed_languages
.
append
(
lang
)
observed_languages
.
append
(
lang
)
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
except
KeyError
:
except
KeyError
:
print
(
"LANG not referenced"
,
(
hyperdata
[
"language_iso3"
]))
print
(
"LANG not referenced"
,
(
hyperdata
[
"language_iso3"
]))
skipped_languages
.
append
(
hyperdata
[
"language_iso3"
])
skipped_languages
.
append
(
hyperdata
[
"language_iso3"
])
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
elif
"language_
full
name"
in
hyperdata
.
keys
():
elif
"language_name"
in
hyperdata
.
keys
():
try
:
try
:
#convert
#convert
lang
=
languages
[
hyperdata
[
"language_
full
name"
]]
.
iso2
lang
=
languages
[
hyperdata
[
"language_name"
]]
.
iso2
if
lang
not
in
LANGUAGES
.
keys
():
if
lang
not
in
LANGUAGES
.
keys
():
skipped_languages
.
append
(
lang
)
skipped_languages
.
append
(
lang
)
return
observed_languages
,
skipped_languages
return
observed_languages
,
skipped_languages
observed_languages
.
append
(
lang
)
observed_languages
.
append
(
lang
)
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
except
KeyError
:
except
KeyError
:
print
(
"LANG Not referenced"
,
(
hyperdata
[
"language_
full
name"
]))
print
(
"LANG Not referenced"
,
(
hyperdata
[
"language_name"
]))
skipped_languages
.
append
(
hyperdata
[
"language_
full
name"
])
skipped_languages
.
append
(
hyperdata
[
"language_name"
])
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
else
:
else
:
print
(
"[WARNING] no language_* found in document [parsing.py]"
)
print
(
"[WARNING] no language_* found in document [parsing.py]"
)
if
DETECT_LANG
is
False
:
if
DETECT_LANG
is
False
:
skipped_languages
.
append
(
"__unknown__"
)
skipped_languages
.
append
(
"__unknown__"
)
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
#no language have been indexed
#no language have been indexed
#detectlang by joining on the DEFAULT_INDEX_FIELDS
#detectlang by joining on the DEFAULT_INDEX_FIELDS
...
@@ -69,16 +69,17 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
...
@@ -69,16 +69,17 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
if
len
(
text
)
<
10
:
if
len
(
text
)
<
10
:
hyperdata
[
"error"
]
=
"Error: no TEXT fields to index"
hyperdata
[
"error"
]
=
"Error: no TEXT fields to index"
skipped_languages
.
append
(
"__unknown__"
)
skipped_languages
.
append
(
"__unknown__"
)
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
else
:
else
:
#detect_lang return iso2
#detect_lang return iso2
lang
=
detect_lang
(
text
)
lang
=
detect_lang
(
text
)
if
lang
not
in
LANGUAGES
.
keys
():
for
k
in
[
"iso2"
,
"iso3"
,
"name"
]:
skipped_languages
.
append
(
lang
)
hyperdata
[
"language_"
+
k
]
=
lang
[
k
]
if
lang
.
iso2
not
in
LANGUAGES
.
keys
():
skipped_languages
.
append
(
lang
.
iso2
)
return
observed_languages
,
skipped_languages
return
observed_languages
,
skipped_languages
observed_languages
.
append
(
lang
)
observed_languages
.
append
(
lang
.
iso2
)
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
def
parse
(
corpus
):
def
parse
(
corpus
):
...
@@ -122,8 +123,8 @@ def parse(corpus):
...
@@ -122,8 +123,8 @@ def parse(corpus):
except
Exception
as
error
:
except
Exception
as
error
:
hyperdata
[
"error"
]
=
"Error normalize_chars"
hyperdata
[
"error"
]
=
"Error normalize_chars"
#adding lang into record hyperdata
#adding lang into record hyperdata
JUST if not declared
observed_languages
,
skipped_languages
=
add_lang
(
hyperdata
,
observed_languages
,
skipped_languages
)
hyperdata
,
observed_languages
,
skipped_languages
=
add_lang
(
hyperdata
,
observed_languages
,
skipped_languages
)
# save as corpus DB child
# save as corpus DB child
# ----------------
# ----------------
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment