Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
d3b5ebbf
Commit
d3b5ebbf
authored
Jul 27, 2016
by
c24b
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Corpus supported languages + skipped docs
parent
1aa7f732
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
9 deletions
+14
-9
ngrams_extraction.py
gargantext/util/toolchain/ngrams_extraction.py
+14
-9
No files found.
gargantext/util/toolchain/ngrams_extraction.py
View file @
d3b5ebbf
...
...
@@ -48,12 +48,17 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
ngrams_data
=
set
()
# extract ngrams
resource
=
corpus
.
resources
()[
0
]
source
=
get_resource
(
resource
[
"type"
])
documents_count
=
0
#load available taggers for the source's default languages
tagger
s_bots
=
{
lang
:
load_tagger
(
lang
)
for
lang
in
re
source
[
'default_languages'
]}
tagger
_bots
=
{
lang
:
load_tagger
(
lang
)
for
lang
in
source
[
'default_languages'
]}
#skipped documents: those whose language is unsupported
corpus
.
skipped_docs
=
[
doc
.
id
for
doc
in
enumerate
(
corpus
.
children
(
'DOCUMENT'
))
if
doc
.
hyperdata
[
"language_iso2"
]
not
in
resource
[
"default_languages"
]]
print
(
set
(
corpus
.
languages
.
keys
())
.
intersection
(
resource
[
"default_languages"
]))
corpus
.
skipped_docs
=
[
doc
.
id
for
doc
in
corpus
.
children
(
'DOCUMENT'
)
if
doc
.
hyperdata
[
"language_iso2"
]
not
in
source
[
"default_languages"
]]
print
(
corpus
.
hyperdata
[
"languages"
])
#add it to corpus.Language info
#diff = set(corpus.hyperdata["languages"].keys()) - set(source["default_languages"]))
#if len(diff) > 1:
# if lang_doc in corpus.hyperdata['languages']:
# skipped_lg_infos = corpus.hyperdata['languages'].pop(lang_doc)
...
...
@@ -77,7 +82,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
#
# else:
# extract ngrams on each of the considered keys
ngramextractor
=
tagger
s_bot
[
lang_doc
]
ngramextractor
=
tagger
_bots
[
lang_doc
]
for
key
in
keys
:
value
=
document
.
hyperdata
.
get
(
key
,
None
)
...
...
@@ -119,11 +124,11 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
corpus
.
status
(
'Ngrams'
,
progress
=
documents_count
+
1
,
complete
=
True
)
corpus
.
save_hyperdata
()
session
.
commit
()
except
Exception
as
error
:
corpus
.
status
(
'Ngrams'
,
error
=
error
)
corpus
.
save_hyperdata
()
session
.
commit
()
raise
error
except
Exception
as
error
:
corpus
.
status
(
'Ngrams'
,
error
=
error
)
corpus
.
save_hyperdata
()
session
.
commit
()
raise
error
def
normalize_forms
(
term_str
,
do_lowercase
=
DEFAULT_ALL_LOWERCASE_FLAG
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment