humanities / gargantext / Commits

Commit 8c8896bd
Authored Aug 25, 2016 by c24b

Merge remote-tracking branch 'origin/romain-stable-patch' into c24b-stable

Parents: f5ee0376 e049772c
Showing 4 changed files with 106 additions and 75 deletions:

  gargantext/util/parsers/_Parser.py               +2   -1
  gargantext/util/toolchain/ngram_groups.py        +2   -2
  gargantext/util/toolchain/ngrams_extraction.py   +54  -21
  gargantext/util/toolchain/parsing.py             +48  -51
gargantext/util/parsers/_Parser.py

@@ -122,7 +122,8 @@ class Parser:
             if language_key in hyperdata:
                 try:
                     language_symbol = hyperdata[language_key]
-                    language = languages[language_symbol]
+                    if language_symbol is not None:
+                        language = languages[language_symbol]
                     if language:
                         break
                 except KeyError:
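The added guard only indexes the languages table when the metadata value is actually set, so a document whose language field is present but empty no longer triggers a lookup with None. A minimal standalone sketch of the pattern (the languages and hyperdata dicts below are hypothetical stand-ins, not the project's objects):

# Hypothetical stand-ins for the real lookup table and document metadata.
languages = {"en": "english", "fr": "french"}
hyperdata = {"language_iso2": None, "language_fullname": "fr"}

language = None
for language_key in ("language_iso2", "language_fullname"):
    if language_key in hyperdata:
        try:
            language_symbol = hyperdata[language_key]
            if language_symbol is not None:        # the new guard: ignore empty values
                language = languages[language_symbol]
            if language:
                break                              # stop at the first usable key
        except KeyError:
            pass                                   # unknown symbol: try the next key

print(language)   # -> "french", found via the second key instead of failing on None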
gargantext/util/toolchain/ngram_groups.py

@@ -26,7 +26,7 @@ def prepare_stemmers(corpus):
        and formatted
     """
     stemmers = {lang:SnowballStemmer(languages[lang].name.lower()) for lang \
-                    in corpus.languages.keys() if lang != "__skipped__"}
+                    in corpus.hyperdata['languages'].keys() if lang != "__skipped__"}
     stemmers['__unknown__'] = SnowballStemmer("english")
     return stemmers

@@ -56,7 +56,7 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
     # preloop per doc to sort ngrams by language
     for doc in corpus.children('DOCUMENT'):
-        if doc.id not in corpus.skipped_docs:
+        if doc.id not in corpus.hyperdata['skipped_docs']:
             if ('language_iso2' in doc.hyperdata):
                 lgid = doc.hyperdata['language_iso2']
             else:
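Both hunks switch the lookups from ad-hoc corpus attributes (corpus.languages, corpus.skipped_docs) to keys stored in corpus.hyperdata, which the reworked parsing step now fills in. A rough sketch of the per-language stemmer cache this builds, using NLTK's SnowballStemmer and a plain dict standing in for corpus.hyperdata (the iso2-to-name table is a simplified assumption):

from nltk.stem.snowball import SnowballStemmer

# Stand-ins for corpus.hyperdata['languages'] and the project's languages table.
hyperdata = {"languages": {"en": 12, "fr": 3, "__skipped__": 2}}   # iso2 -> doc count
iso2_to_name = {"en": "English", "fr": "French"}

stemmers = {lang: SnowballStemmer(iso2_to_name[lang].lower())
            for lang in hyperdata["languages"].keys()
            if lang != "__skipped__"}
stemmers["__unknown__"] = SnowballStemmer("english")   # fallback when no language was detected

print(stemmers["en"].stem("languages"))   # -> "languag"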
gargantext/util/toolchain/ngrams_extraction.py

@@ -47,29 +47,60 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
         resource = corpus.resources()[0]
         documents_count = 0
         source = get_resource(resource["type"])
-        #load available taggers for source default langage
-        docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
-        # preload available taggers for corpus languages
-        tagger_bots = {lang: load_tagger(lang)() for lang in corpus.languages if lang != "__skipped__"}
+        tagger_bots = {}
+        #sort docs by lang?
+        skipped_languages = {}
+        # for lang, tagger in tagger_bots.items():
+        for lang in corpus.hyperdata['languages']:
+            try:
+                tagger_bots[lang] = load_tagger(lang)()
+            except KeyError:
+                skipped_languages[lang] = True
+                print("WARNING skipping language:", lang)
+        # the list of todo docs
+        docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.hyperdata['skipped_docs']]
+        # go for the loop
         for documents_count, document in enumerate(docs):
             language_iso2 = document.hyperdata.get('language_iso2')
-            tagger = tagger_bots[language_iso2]
             #print(language_iso2)
+            # skip case if no tagger available
+            if language_iso2 in skipped_languages:
+                corpus.hyperdata['skipped_docs'][document.id] = True
+                corpus.save_hyperdata()
+                document.hyperdata["error"] = "Error: unsupported language"
+                document.save_hyperdata()
+                session.commit()
+                continue
+            # NORMAL CASE
+            tagger = tagger_bots[language_iso2]
             for key in keys:
+                key = str(key)
+                if key not in document.hyperdata:
+                    # print("DBG missing key in doc", key)
+                    # TODO test if document has no keys at all
+                    continue
+                # get a text value
+                value = document[key]
+                if not isinstance(value, str):
+                    print("DBG wrong content in doc for key", key)
+                    continue
                 try:
-                    value = document[str(key)]
-                    if not isinstance(value, str):
-                        continue
-                    # get ngrams
-                    for ngram in tagger.extract(value):
+                    ngrams = tagger.extract(value)
+                    for ngram in ngrams:
                         tokens = tuple(normalize_forms(token[0]) for token in ngram)
                         if do_subngrams:
                             # ex tokens = ["very", "cool", "exemple"]
-                            #    subterms = [['very', 'cool'],
-                            #                ['very', 'cool', 'exemple'],
-                            #                ['cool', 'exemple']]
+                            #    subterms = [['very', 'cool'],...]
                             subterms = subsequences(tokens)
                         else:
                             subterms = [tokens]
@@ -81,13 +112,11 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                             nodes_ngrams_count[(document.id, ngram)] += 1
                             # add fields : terms n
                             ngrams_data.add((ngram[:255], len(seqterm), ))
-                except:
-                    #value not in doc
-                    pass
-            # except AttributeError:
-            #     print("ERROR NO language_iso2")
-            #     document.status("NGRAMS", error="No lang detected skipped Ngrams")
-            #     corpus.skipped_docs.append(document.id)
+                except Exception as e:
+                    print('NGRAMS EXTRACTION skipping doc %i because of unknown error:' % document.id, str(e))
+                    # TODO add info to document.hyperdata['error']
+                    pass
             # integrate ngrams and nodes-ngrams
             if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
                 _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
@@ -105,9 +134,13 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                 nodes_ngrams_count.clear()
                 ngrams_data.clear()
+        corpus.hyperdata['skipped_languages'] = skipped_languages
+        corpus.save_hyperdata()
         corpus.status('Ngrams', progress=documents_count+1, complete=True)
         corpus.save_hyperdata()
         session.commit()
     except Exception as error:
         corpus.status('Ngrams', error=error)
         corpus.save_hyperdata()
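The net effect of these hunks is that taggers are preloaded once per observed language, and any language without a tagger is recorded in skipped_languages so its documents get flagged and skipped instead of raising inside the extraction loop. A self-contained sketch of that preload-and-skip pattern (load_tagger and the corpus data here are simplified stand-ins, not the project's registry):

# Stand-in tagger loader: raises KeyError for unsupported languages, like the real registry lookup.
SUPPORTED = {"en", "fr"}

def load_tagger(lang):
    if lang not in SUPPORTED:
        raise KeyError(lang)
    return lambda text: text.split()          # trivial "tagger" for the sketch

corpus_languages = {"en": 10, "de": 2}        # iso2 -> doc count, as kept in corpus.hyperdata['languages']

tagger_bots, skipped_languages = {}, {}
for lang in corpus_languages:
    try:
        tagger_bots[lang] = load_tagger(lang)
    except KeyError:
        skipped_languages[lang] = True
        print("WARNING skipping language:", lang)

# Per document: flag and skip when the language has no tagger, otherwise extract.
for doc in ({"id": 1, "language_iso2": "en", "text": "some text"},
            {"id": 2, "language_iso2": "de", "text": "etwas Text"}):
    if doc["language_iso2"] in skipped_languages:
        print("doc", doc["id"], "flagged: unsupported language")
        continue
    print("doc", doc["id"], "tokens:", tagger_bots[doc["language_iso2"]](doc["text"]))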
gargantext/util/toolchain/parsing.py

@@ -10,31 +10,30 @@ def parse(corpus):
     try:
         documents_count = 0
         corpus.status('Docs', progress=0)
-        #1 corpus => 1 resource
+        # shortcut to hyperdata's list of added resources (packs of docs)
         resources = corpus.resources()
-        #get the sources capabilities for a given corpus resource
-        sources = [get_resource(resource["type"]) for resource in corpus.resources() if resource["extracted"] is False]
-        if len(sources) == 0:
-            #>>> documents have already been parsed?????
-            return
-        if len(sources) > 0:
-            #>>> necessairement 1 corpus = 1 source dans l'archi actuelle
-            source = sources[0]
-            resource = resources[0]
-            #source.extend(resource)
-        if source["parser"] is None:
-            #corpus.status(error)
-            raise ValueError("Resource '%s' has no Parser" % resource["name"])
-        else:
-            #observed langages in corpus docs
-            corpus.languages = defaultdict.fromkeys(source["default_languages"], 0)
-            #remember the skipped docs in parsing
-            skipped_languages = []
-            corpus.skipped_docs = []
-            session.add(corpus)
-            session.commit()
-            #load the corresponding parser
-            parserbot = load_parser(source)
-            # extract and insert documents from resource.path into database
-            default_lang_field = ["language_" + l for l in ["iso2", "iso3", "full_name"]]
+        # vars to gather some infos during parsing (=> will end up in hyperdata)
+        skipped_docs = defaultdict(bool)
+        observed_languages = defaultdict(int)
+        # each resource contains a path to a file with the docs
+        for i, resource in enumerate(resources):
+            # we'll only want the resources that have never been extracted
+            if resource["extracted"]:
+                continue
+            # the sourcetype's infos
+            source_infos = get_resource(resource['type'])
+            if source_infos["parser"] is None:
+                #corpus.status(error)
+                raise ValueError("Resource '%s' has no Parser" % resource["name"])
+            else:
+                # load the corresponding parser
+                parserbot = load_parser(source_infos)
+                # extract and insert documents from resource.path into database
+                default_lang_field = ["language_" + l for l in ["iso2", "iso3", "full_name"]]
@@ -47,15 +46,10 @@ def parse(corpus):
                 except Exception as error:
                     hyperdata["error"] = "Error normalize_chars"
-                # any parserbot should implement a language_iso2
+                #any parser should implement a language_iso2
                 if "language_iso2" in hyperdata.keys():
-                    try:
-                        corpus.languages[hyperdata["language_iso2"]] += 1
-                    except KeyError:
-                        hyperdata["error"] = "Error: unsupported language"
-                        skipped_languages.append(hyperdata["language_iso2"])
+                    observed_languages[hyperdata["language_iso2"]] += 1
                 # this should be the responsability of the parserbot
                 # elif "language_iso3" in hyperdata.keys():
                 #     try:
@@ -66,22 +60,15 @@ def parse(corpus):
                 else:
                     print("[WARNING] no language_iso2 found in document [parsing.py]")
-                    #no language have been indexed
-                    #detectlang by index_fields
-                    text = " ".join([getattr(hyperdata, k, '') for k in DEFAULT_INDEX_FIELDS])
+                    # no language has been found by parserbot
+                    # => detectlang on index_fields
+                    text = " ".join([getattr(hyperdata, k) for k in DEFAULT_INDEX_FIELDS])
                     if len(text) < 10:
                         hyperdata["error"] = "Error: no TEXT fields to index"
-                        skipped_languages.append("__unknown__")
-                    hyperdata["language_iso2"] = detect_lang(text)
-                    try:
-                        corpus.languages[hyperdata["language_iso2"]] += 1
-                    except KeyError:
-                        hyperdata["error"] = "Error: unsupported language"
-                        skipped_languages.append(hyperdata["language_iso2"])
+                    else:
+                        predicted_lang = detect_lang(text)
+                        hyperdata["language_iso2"] = predicted_lang
+                        observed_languages[predicted_lang] += 1
                 # save as DB child
                 # ----------------
@@ -97,8 +84,10 @@ def parse(corpus):
                     document.status('Parsing', error=document.hyperdata["error"])
                     document.save_hyperdata()
                     session.commit()
-                    #adding skipped_docs for later processsing
-                    corpus.skipped_docs.append(document.id)
+                    # adding to skipped_docs for later processing
+                    skipped_docs[document.id] = True
                 documents_count += 1
                 # logging
@@ -109,19 +98,27 @@ def parse(corpus):
             session.commit()
             # update info about the resource
-            resource['extracted'] = True
-            # add a corpus-level info about languages adding a __skipped__ info
-            corpus.languages['__skipped__'] = Counter(skipped_languages)
+            corpus.hyperdata['resources'][i]['extracted'] = True
+            corpus.save_hyperdata()
+            session.commit()
+        print("PARSING:", len(skipped_docs), "docs skipped")
         print("LANGUES")
-        for n in corpus.languages.items():
+        for n in observed_languages.items():
             print(n)
-        print(len(corpus.skipped_docs), "docs skipped")
+        #TO DO: give the main language of the corpus to unsupported lang docs
+        # add the infos to hyperdata at the end
+        corpus.hyperdata['skipped_docs'] = skipped_docs
+        corpus.hyperdata['languages'] = observed_languages
+        corpus.save_hyperdata()
         # commit all changes
         corpus.status('Docs', progress=documents_count, complete=True)
         corpus.save_hyperdata()
         session.add(corpus)
         session.commit()
     except Exception as error:
         corpus.status('Docs', error=error)
         corpus.save_hyperdata()
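Taken together, parse() now keeps its bookkeeping in two local defaultdicts (observed_languages and skipped_docs) and writes them into corpus.hyperdata at the end, rather than mutating corpus.languages and corpus.skipped_docs attributes along the way. A small self-contained sketch of that pattern (the document list below is a hypothetical stand-in, not the project's ORM objects):

from collections import defaultdict

# Hypothetical parsed documents; language_iso2 is None when the parser found no language.
docs = [{"id": 1, "language_iso2": "en"},
        {"id": 2, "language_iso2": None},
        {"id": 3, "language_iso2": "fr"}]

skipped_docs = defaultdict(bool)
observed_languages = defaultdict(int)

for doc in docs:
    lang = doc["language_iso2"]
    if lang is None:
        skipped_docs[doc["id"]] = True        # flagged for later processing
        continue
    observed_languages[lang] += 1

corpus_hyperdata = {}                          # stand-in for corpus.hyperdata
corpus_hyperdata["skipped_docs"] = dict(skipped_docs)
corpus_hyperdata["languages"] = dict(observed_languages)
print(corpus_hyperdata)                        # {'skipped_docs': {2: True}, 'languages': {'en': 1, 'fr': 1}}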