Project: humanities / gargantext

Commit db1b31a2
Authored Aug 25, 2016 by c24b

    Merge remote-tracking branch 'origin/romain-stable-patch' into c24b-stable

Parents: 7ed3dc0b, 570c9fd8

Showing 4 changed files with 106 additions and 75 deletions (+106, -75)
    gargantext/util/parsers/_Parser.py                 +2   -1
    gargantext/util/toolchain/ngram_groups.py          +2   -2
    gargantext/util/toolchain/ngrams_extraction.py     +54  -21
    gargantext/util/toolchain/parsing.py               +48  -51
gargantext/util/parsers/_Parser.py

@@ -122,6 +122,7 @@ class Parser:
         if language_key in hyperdata:
             try:
                 language_symbol = hyperdata[language_key]
                 if language_symbol is not None:
                     language = languages[language_symbol]
                     if language:
                         break
gargantext/util/toolchain/ngram_groups.py

@@ -26,7 +26,7 @@ def prepare_stemmers(corpus):
        and formatted
     """
     stemmers = {lang: SnowballStemmer(languages[lang].name.lower()) for lang \
-                    in corpus.languages.keys() if lang != "__skipped__"}
+                    in corpus.hyperdata['languages'].keys() if lang != "__skipped__"}
     stemmers['__unknown__'] = SnowballStemmer("english")
     return stemmers

@@ -56,7 +56,7 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
     # preloop per doc to sort ngrams by language
     for doc in corpus.children('DOCUMENT'):
-        if doc.id not in corpus.skipped_docs:
+        if doc.id not in corpus.hyperdata['skipped_docs']:
             if ('language_iso2' in doc.hyperdata):
                 lgid = doc.hyperdata['language_iso2']
             else:
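For context on the stemmers dict built in prepare_stemmers(): the hunk uses NLTK's SnowballStemmer, which takes a lowercase language name such as "english" and exposes a stem() method. A minimal usage sketch; only the NLTK calls are taken from the diff, the printed outputs are what Snowball's English stemmer produces:

    from nltk.stem.snowball import SnowballStemmer

    # "english" is the fallback used for the '__unknown__' key in prepare_stemmers()
    stemmer = SnowballStemmer("english")
    print(stemmer.stem("languages"))   # -> 'languag'
    print(stemmer.stem("skipped"))     # -> 'skip'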
gargantext/util/toolchain/ngrams_extraction.py

@@ -47,29 +47,60 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
         resource = corpus.resources()[0]
         documents_count = 0
         source = get_resource(resource["type"])
-        #load available taggers for source default langage
-        docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
-        tagger_bots = {lang: load_tagger(lang)() for lang in corpus.languages if lang != "__skipped__"}
-        #sort docs by lang?
-        # for lang, tagger in tagger_bots.items():
+        # preload available taggers for corpus languages
+        tagger_bots = {}
+        skipped_languages = {}
+        for lang in corpus.hyperdata['languages']:
+            try:
+                tagger_bots[lang] = load_tagger(lang)()
+            except KeyError:
+                skipped_languages[lang] = True
+                print("WARNING skipping language:", lang)
+        # the list of todo docs
+        docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.hyperdata['skipped_docs']]
+        # go for the loop
         for documents_count, document in enumerate(docs):
             language_iso2 = document.hyperdata.get('language_iso2')
-            tagger = tagger_bots[language_iso2]
-            #print(language_iso2)
+            # skip case if no tagger available
+            if language_iso2 in skipped_languages:
+                corpus.hyperdata['skipped_docs'][document.id] = True
+                corpus.save_hyperdata()
+                document.hyperdata["error"] = "Error: unsupported language"
+                document.save_hyperdata()
+                session.commit()
+                continue
+            # NORMAL CASE
+            tagger = tagger_bots[language_iso2]
             for key in keys:
-                try:
-                    value = document[str(key)]
+                key = str(key)
+                if key not in document.hyperdata:
+                    # print("DBG missing key in doc", key)
+                    # TODO test if document has no keys at all
+                    continue
+                # get a text value
+                value = document[key]
+                if not isinstance(value, str):
+                    print("DBG wrong content in doc for key", key)
+                    continue
+                try:
                     # get ngrams
-                    for ngram in tagger.extract(value):
+                    ngrams = tagger.extract(value)
+                    for ngram in ngrams:
                         tokens = tuple(normalize_forms(token[0]) for token in ngram)
                         if do_subngrams:
                             # ex tokens = ["very", "cool", "exemple"]
-                            # subterms = [['very', 'cool'],
-                            #             ['very', 'cool', 'exemple'],
-                            #             ['cool', 'exemple']]
+                            # subterms = [['very', 'cool'],...]
                             subterms = subsequences(tokens)
                         else:
                             subterms = [tokens]
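The do_subngrams branch calls a subsequences() helper from gargantext's utilities, which is not shown in this diff. Judging only from the inline comment (tokens ["very", "cool", "exemple"] expanding to three longer sub-lists), a minimal sketch of that behaviour could look like the following; the real helper may differ:

    def subsequences(tokens):
        """All contiguous sub-sequences of length >= 2, in start-index order."""
        return [list(tokens[i:j])
                for i in range(len(tokens))
                for j in range(i + 2, len(tokens) + 1)]

    print(subsequences(("very", "cool", "exemple")))
    # [['very', 'cool'], ['very', 'cool', 'exemple'], ['cool', 'exemple']]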
@@ -81,13 +112,11 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                             nodes_ngrams_count[(document.id, ngram)] += 1
                             # add fields : terms n
                             ngrams_data.add((ngram[:255], len(seqterm), ))
-                except:
-                    #value not in doc
+                except Exception as e:
+                    print('NGRAMS EXTRACTION skipping doc %i because of unknown error:' % document.id, str(e))
+                    # TODO add info to document.hyperdata['error']
                     pass
-            # except AttributeError:
-            #     print("ERROR NO language_iso2")
-            #     document.status("NGRAMS", error="No lang detected skipped Ngrams")
-            #     corpus.skipped_docs.append(document.id)
             # integrate ngrams and nodes-ngrams
             if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
                 _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)

@@ -105,9 +134,13 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                 nodes_ngrams_count.clear()
                 ngrams_data.clear()
+        corpus.hyperdata['skipped_languages'] = skipped_languages
+        corpus.save_hyperdata()
         corpus.status('Ngrams', progress=documents_count+1, complete=True)
         corpus.save_hyperdata()
         session.commit()
     except Exception as error:
         corpus.status('Ngrams', error=error)
         corpus.save_hyperdata()
gargantext/util/toolchain/parsing.py
@@ -10,31 +10,30 @@ def parse(corpus):
     try:
         documents_count = 0
         corpus.status('Docs', progress=0)
-        #1 corpus => 1 resource
+        # shortcut to hyperdata's list of added resources (packs of docs)
         resources = corpus.resources()
-        #get the sources capabilities for a given corpus resource
-        sources = [get_resource(resource["type"]) for resource in corpus.resources() if resource["extracted"] is False]
-        if len(sources) == 0:
-            #>>> documents have already been parsed?????
-            return
-        if len(sources) > 0:
-            #>>> necessarily 1 corpus = 1 source in the current architecture
-            source = sources[0]
-            resource = resources[0]
-            #source.extend(resource)
-        if source["parser"] is None:
+        # vars to gather some infos during parsing (=> will end up in hyperdata)
+        skipped_docs = defaultdict(bool)
+        observed_languages = defaultdict(int)
+        # each resource contains a path to a file with the docs
+        for i, resource in enumerate(resources):
+            # we'll only want the resources that have never been extracted
+            if resource["extracted"]:
+                continue
+            # the sourcetype's infos
+            source_infos = get_resource(resource['type'])
+            if source_infos["parser"] is None:
                 #corpus.status(error)
                 raise ValueError("Resource '%s' has no Parser" % resource["name"])
             else:
-                #observed langages in corpus docs
-                corpus.languages = defaultdict.fromkeys(source["default_languages"], 0)
-                #remember the skipped docs in parsing
-                skipped_languages = []
-                corpus.skipped_docs = []
-                session.add(corpus)
-                session.commit()
-                #load the corresponding parser
-                parserbot = load_parser(source)
+                # load the corresponding parser
+                parserbot = load_parser(source_infos)
                 # extract and insert documents from resource.path into database
                 default_lang_field = ["language_" + l for l in ["iso2", "iso3", "full_name"]]
@@ -47,15 +46,10 @@ def parse(corpus):
                     except Exception as error:
                         hyperdata["error"] = "Error normalize_chars"
-                    #any parser should implement a language_iso2
+                    # any parserbot should implement a language_iso2
                     if "language_iso2" in hyperdata.keys():
-                        try:
-                            corpus.languages[hyperdata["language_iso2"]] += 1
-                        except KeyError:
-                            hyperdata["error"] = "Error: unsupported language"
-                            skipped_languages.append(hyperdata["language_iso2"])
+                        observed_languages[hyperdata["language_iso2"]] += 1
                     # this should be the responsability of the parserbot
                     # elif "language_iso3" in hyperdata.keys():
                     #     try:
@@ -66,22 +60,15 @@ def parse(corpus):
                     else:
                         print("[WARNING] no language_iso2 found in document [parsing.py]")
-                        #no language have been indexed
-                        #detectlang by index_fields
-                        text = " ".join([getattr(hyperdata, k) for k in DEFAULT_INDEX_FIELDS])
+                        # no language has been found by parserbot
+                        # => detectlang on index_fields
+                        text = " ".join([getattr(hyperdata, k, '') for k in DEFAULT_INDEX_FIELDS])
                         if len(text) < 10:
                             hyperdata["error"] = "Error: no TEXT fields to index"
                             skipped_languages.append("__unknown__")
-                        hyperdata["language_iso2"] = detect_lang(text)
-                        try:
-                            corpus.languages[hyperdata["language_iso2"]] += 1
-                            corpus.languages[hyperdata["language_iso2"]] += 1
-                        except KeyError:
-                            hyperdata["error"] = "Error: unsupported language"
-                            skipped_languages.append(hyperdata["language_iso2"])
+                        else:
+                            predicted_lang = detect_lang(text)
+                            hyperdata["language_iso2"] = predicted_lang
+                            observed_languages[predicted_lang] += 1
                     # save as DB child
                     # ----------------
@@ -97,8 +84,10 @@ def parse(corpus):
                         document.status('Parsing', error=document.hyperdata["error"])
                         document.save_hyperdata()
                         session.commit()
-                        #adding skipped_docs for later processsing
-                        corpus.skipped_docs.append(document.id)
+                        # adding to skipped_docs for later processing
+                        skipped_docs[document.id] = True
                     documents_count += 1
                     # logging
@@ -109,19 +98,27 @@ def parse(corpus):
                 session.commit()
             # update info about the resource
-            resource['extracted'] = True
-            # add a corpus-level info about languages adding a __skipped__ info
-            corpus.languages['__skipped__'] = Counter(skipped_languages)
+            corpus.hyperdata['resources'][i]['extracted'] = True
+            corpus.save_hyperdata()
+            session.commit()
             print("PARSING:", len(skipped_docs), "docs skipped")
             print("LANGUES")
-            for n in corpus.languages.items():
+            for n in observed_languages.items():
                 print(n)
             #TO DO: give the main language of the corpus to unsupported lang docs
-            print(len(corpus.skipped_docs), "docs skipped")
+        # add the infos to hyperdata at the end
+        corpus.hyperdata['skipped_docs'] = skipped_docs
+        corpus.hyperdata['languages'] = observed_languages
+        corpus.save_hyperdata()
+        # commit all changes
         corpus.status('Docs', progress=documents_count, complete=True)
         corpus.save_hyperdata()
         session.add(corpus)
         session.commit()
     except Exception as error:
         corpus.status('Docs', error=error)
         corpus.save_hyperdata()
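Across these hunks, parse() stops keeping skipped documents and observed languages as transient attributes (corpus.skipped_docs, corpus.languages) and instead records them in corpus.hyperdata so the later toolchain steps (ngrams_extraction, ngram_groups) can read them back. A condensed, self-contained sketch of that bookkeeping pattern; the FakeCorpus class and the document id 42 are stand-ins for illustration only, assuming corpus.hyperdata behaves like a plain dict that save_hyperdata() persists:

    from collections import defaultdict

    class FakeCorpus:
        """Stand-in for the real corpus node, for illustration only."""
        def __init__(self):
            self.hyperdata = {}
        def save_hyperdata(self):
            print("saved:", self.hyperdata)

    corpus = FakeCorpus()

    # gathered while looping over parsed documents
    skipped_docs = defaultdict(bool)        # doc_id -> True when a doc is skipped
    observed_languages = defaultdict(int)   # iso2 code -> number of docs seen
    skipped_docs[42] = True                 # hypothetical document id
    observed_languages["fr"] += 1

    # persisted at the end of parse(), so later toolchain steps can reuse it
    corpus.hyperdata['skipped_docs'] = dict(skipped_docs)
    corpus.hyperdata['languages'] = dict(observed_languages)
    corpus.save_hyperdata()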