Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
9e7284d2
Commit
9e7284d2
authored
Aug 26, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'c24b-stable' into romain-stable-patch2
parents
b1fce79d
22a96c99
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
54 additions
and
41 deletions
+54
-41
constants.py
gargantext/constants.py
+1
-1
languages.py
gargantext/util/languages.py
+3
-14
ngrams_extraction.py
gargantext/util/toolchain/ngrams_extraction.py
+1
-0
parsing.py
gargantext/util/toolchain/parsing.py
+26
-22
project.html
templates/pages/projects/project.html
+23
-4
No files found.
gargantext/constants.py
View file @
9e7284d2
...
...
@@ -313,7 +313,7 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# at indexing after extraction)
# TAGGING options -----------------------------------------
#activate lang detection?
DETECT_LANG
=
Tru
e
DETECT_LANG
=
Fals
e
# Defaults INDEXED Fields for ngrams extraction
# put longest field first in order to make detection language more efficient
DEFAULT_INDEX_FIELDS
=
(
'abstract'
,
'title'
)
...
...
gargantext/util/languages.py
View file @
9e7284d2
from
gargantext.constants
import
*
from
langdetect
import
detect
,
DetectorFactory
import
time
def
timing
(
f
):
def
wrap
(
*
args
):
time1
=
time
.
time
()
ret
=
f
(
*
args
)
time2
=
time
.
time
()
print
(
'function took
%0.3
f ms'
%
((
time2
-
time1
)
*
1000.0
))
return
ret
return
wrap
class
Language
:
def
__init__
(
self
,
iso2
=
None
,
iso3
=
None
,
full_name
=
None
,
name
=
None
):
self
.
iso2
=
iso2
self
.
iso3
=
iso3
self
.
name
=
name
self
.
full_name
=
full_name
self
.
implemented
=
iso2
in
LANGUAGES
def
__str__
(
self
):
...
...
@@ -38,10 +27,10 @@ class Languages(dict):
languages
=
Languages
()
@
timing
def
detect_lang
(
text
):
DetectorFactory
.
seed
=
0
return
languages
[
detect
(
text
)]
.
iso2
return
languages
[
detect
(
text
)]
import
pycountry
pycountry_keys
=
(
...
...
gargantext/util/toolchain/ngrams_extraction.py
View file @
9e7284d2
...
...
@@ -53,6 +53,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
#print(LANGUAGES.keys())
tagger_bots
=
{
lang
:
load_tagger
(
lang
)
for
lang
in
corpus
.
hyperdata
[
"languages"
]
\
if
lang
!=
"__unknown__"
}
tagger_bots
[
"__unknown__"
]
=
load_tagger
(
"en"
)
print
(
"#TAGGERS LOADED: "
,
tagger_bots
)
supported_taggers_lang
=
tagger_bots
.
keys
()
print
(
"#SUPPORTED TAGGER LANGS"
,
supported_taggers_lang
)
...
...
gargantext/util/toolchain/parsing.py
View file @
9e7284d2
...
...
@@ -21,7 +21,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
skipped_languages
.
append
(
hyperdata
[
"language_iso2"
])
return
observed_languages
,
skipped_languages
observed_languages
.
append
(
hyperdata
[
"language_iso2"
])
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
elif
"language_iso3"
in
hyperdata
.
keys
():
...
...
@@ -32,53 +32,57 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
skipped_languages
.
append
(
lang
)
return
observed_languages
,
skipped_languages
observed_languages
.
append
(
lang
)
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
except
KeyError
:
print
(
"LANG not referenced"
,
(
hyperdata
[
"language_iso3"
]))
skipped_languages
.
append
(
hyperdata
[
"language_iso3"
])
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
elif
"language_
full
name"
in
hyperdata
.
keys
():
elif
"language_name"
in
hyperdata
.
keys
():
try
:
#convert
lang
=
languages
[
hyperdata
[
"language_
full
name"
]]
.
iso2
lang
=
languages
[
hyperdata
[
"language_name"
]]
.
iso2
if
lang
not
in
LANGUAGES
.
keys
():
skipped_languages
.
append
(
lang
)
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
observed_languages
.
append
(
lang
)
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
except
KeyError
:
print
(
"LANG Not referenced"
,
(
hyperdata
[
"language_
full
name"
]))
skipped_languages
.
append
(
hyperdata
[
"language_
full
name"
])
return
observed_languages
,
skipped_languages
print
(
"LANG Not referenced"
,
(
hyperdata
[
"language_name"
]))
skipped_languages
.
append
(
hyperdata
[
"language_name"
])
return
hyperdata
,
observed_languages
,
skipped_languages
else
:
print
(
"[WARNING] no language_* found in document [parsing.py]"
)
if
DETECT_LANG
is
False
:
skipped_languages
.
append
(
"__unknown__"
)
return
observed_languages
,
skipped_languages
#skipped_languages.append("__unknown__")
hyperdata
[
"language_iso2"
]
=
"__unknown__"
return
hyperdata
,
observed_languages
,
skipped_languages
#no language have been indexed
#detectlang by joining on the DEFAULT_INDEX_FIELDS
text_fields2
=
list
(
set
(
DEFAULT_INDEX_FIELDS
)
&
set
(
hyperdata
.
keys
()))
print
(
len
(
text_fields2
))
if
len
(
text_fields2
)
<
2
:
print
(
"[WARNING] missing
%
s key"
%
text_fields
)
text
=
" "
.
join
([
hyperdata
[
k
]
for
k
in
text_fields2
])
if
len
(
text
)
<
10
:
hyperdata
[
"error"
]
=
"Error: no TEXT fields to index"
skipped_languages
.
append
(
"__unknown__"
)
return
observed_languages
,
skipped_languages
return
hyperdata
,
observed_languages
,
skipped_languages
else
:
#detect_lang return iso2
lang
=
detect_lang
(
text
)
if
lang
not
in
LANGUAGES
.
keys
():
skipped_languages
.
append
(
lang
)
return
observed_languages
,
skipped_languages
observed_languages
.
append
(
lang
)
return
observed_languages
,
skipped_languages
for
k
in
[
"iso2"
,
"iso3"
,
"name"
]:
hyperdata
[
"language_"
+
k
]
=
getattr
(
lang
,
k
)
if
lang
.
iso2
not
in
LANGUAGES
.
keys
():
#hyperdata["language_iso2"] = "__unknown__"
skipped_languages
.
append
(
lang
.
iso2
)
return
hyperdata
,
observed_languages
,
skipped_languages
observed_languages
.
append
(
lang
.
iso2
)
return
hyperdata
,
observed_languages
,
skipped_languages
def
parse
(
corpus
):
...
...
@@ -122,8 +126,8 @@ def parse(corpus):
except
Exception
as
error
:
hyperdata
[
"error"
]
=
"Error normalize_chars"
#adding lang into record hyperdata
observed_languages
,
skipped_languages
=
add_lang
(
hyperdata
,
observed_languages
,
skipped_languages
)
#adding lang into record hyperdata
JUST if not declared
hyperdata
,
observed_languages
,
skipped_languages
=
add_lang
(
hyperdata
,
observed_languages
,
skipped_languages
)
# save as corpus DB child
# ----------------
...
...
templates/pages/projects/project.html
View file @
9e7284d2
...
...
@@ -24,6 +24,8 @@
{% block content %}
<div
class=
"container theme-showcase"
role=
"main"
>
<div
class=
"jumbotron"
>
<div
class=
"row"
>
...
...
@@ -35,6 +37,7 @@
<!--<h3> {{number}} corpora </h3>-->
{% endif %}
</div>
<div
class=
"col-md-4"
>
<p>
{% if donut %}
...
...
@@ -68,6 +71,16 @@
<div
class=
"container"
>
<!-- Modal -->
<div
id=
"wait"
class=
"modal row col-md-6"
>
<div
class=
"modal-dialog "
>
<h2>
Your file has been uploaded !
</h2>
<h2>
Gargantext need some time to eat it.
</h2>
<h2>
Duration depends on the size of the dish.
</h2>
<a
class=
"btn btn-primary btn-lg"
href=
"/projects/{{ project.id }}"
title=
"Click and test by yourself"
>
Continue on Gargantext
</a>
</div>
</div>
{% if list_corpora %}
{% for key, corpora in list_corpora.items %}
...
...
@@ -184,7 +197,9 @@
{% endif %}
<!-- Modal -->
<div
class=
"modal fade"
id=
"stack1"
tabindex=
"-1"
role=
"dialog"
aria-labelledby=
"myModalLabel"
aria-hidden=
"true"
>
<div
class=
"modal-dialog"
>
<div
class=
"modal-content"
>
...
...
@@ -318,7 +333,9 @@
// console.log(data)
setTimeout
(
function
()
{
location
.
reload
();
$
(
'#addcorpus'
).
modal
(
'hide'
);
$
(
"#wait"
).
modal
(
"show"
);
},
3000
);
},
error
:
function
(
result
)
{
...
...
@@ -563,9 +580,11 @@
// console.log(data)
setTimeout
(
function
()
{
$
(
'#addcorpus'
).
modal
(
'hide'
);
$
(
"#wait"
).
modal
(
"show"
);
location
.
reload
();
},
5
000
);
//
location.reload();
},
3
000
);
},
error
:
function
(
result
)
{
console
.
log
(
"in testISTEX(). Data not found"
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment