Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
e8d5e001
Commit
e8d5e001
authored
Sep 05, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
parsing+extraction: removing old debug messages
parent
5ce424f9
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
14 additions
and
13 deletions
+14
-13
EUROPRESSE.py
gargantext/util/parsers/EUROPRESSE.py
+0
-2
ISTEX.py
gargantext/util/parsers/ISTEX.py
+0
-1
_Tagger.py
gargantext/util/taggers/_Tagger.py
+1
-1
ngrams_extraction.py
gargantext/util/toolchain/ngrams_extraction.py
+3
-2
parsing.py
gargantext/util/toolchain/parsing.py
+10
-7
No files found.
gargantext/util/parsers/EUROPRESSE.py
View file @
e8d5e001
...
@@ -115,8 +115,6 @@ class EuropresseParser(Parser):
...
@@ -115,8 +115,6 @@ class EuropresseParser(Parser):
# parse all the articles, one by one
# parse all the articles, one by one
for
html_article
in
html_articles
:
for
html_article
in
html_articles
:
try
:
try
:
print
(
"==============================new article"
)
# s'il n'y a pas du tout de header on doit skip
# s'il n'y a pas du tout de header on doit skip
all_header
=
html_article
.
xpath
(
entire_header_xpath
)
all_header
=
html_article
.
xpath
(
entire_header_xpath
)
all_header_text
=
" "
.
join
(
scrap_text
(
all_header
))
all_header_text
=
" "
.
join
(
scrap_text
(
all_header
))
...
...
gargantext/util/parsers/ISTEX.py
View file @
e8d5e001
...
@@ -27,7 +27,6 @@ class ISTexParser(Parser):
...
@@ -27,7 +27,6 @@ class ISTexParser(Parser):
}
}
suma
=
0
suma
=
0
print
(
len
(
json_docs
))
for
json_doc
in
json_docs
:
for
json_doc
in
json_docs
:
hyperdata
=
{}
hyperdata
=
{}
...
...
gargantext/util/taggers/_Tagger.py
View file @
e8d5e001
...
@@ -32,7 +32,7 @@ class Tagger:
...
@@ -32,7 +32,7 @@ class Tagger:
self
.
text
=
self
.
clean_text
(
text
)
self
.
text
=
self
.
clean_text
(
text
)
grammar
=
nltk
.
RegexpParser
(
label
+
': '
+
rule
)
grammar
=
nltk
.
RegexpParser
(
label
+
': '
+
rule
)
tagged_tokens
=
list
(
self
.
tag_text
(
self
.
text
))
tagged_tokens
=
list
(
self
.
tag_text
(
self
.
text
))
print
(
"the tagged_tokens"
,
tagged_tokens
)
#
print("the tagged_tokens", tagged_tokens)
if
len
(
tagged_tokens
):
if
len
(
tagged_tokens
):
grammar_parsed
=
grammar
.
parse
(
tagged_tokens
)
grammar_parsed
=
grammar
.
parse
(
tagged_tokens
)
for
subtree
in
grammar_parsed
.
subtrees
():
for
subtree
in
grammar_parsed
.
subtrees
():
...
...
gargantext/util/toolchain/ngrams_extraction.py
View file @
e8d5e001
...
@@ -11,7 +11,8 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
...
@@ -11,7 +11,8 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
£TODO: load whole word dictionary in ram and check existence before inserting to db => sequential insert => probably faster!
£TODO: load whole word dictionary in ram and check existence before inserting to db => sequential insert => probably faster!
"""
"""
print
(
'INTEGRATE'
,
len
(
ngrams_data
),
len
(
nodes_ngrams_count
))
# print('INTEGRATE', len(ngrams_data), len(nodes_ngrams_count))
print
(
'INTEGRATE'
)
# integrate ngrams (aka new words)
# integrate ngrams (aka new words)
ngrams_ids
=
bulk_insert_ifnotexists
(
ngrams_ids
=
bulk_insert_ifnotexists
(
model
=
Ngram
,
# todo type should :str ~~> :str|:re) !!!
model
=
Ngram
,
# todo type should :str ~~> :str|:re) !!!
...
@@ -118,7 +119,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
...
@@ -118,7 +119,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
# integrate ngrams and nodes-ngrams
# integrate ngrams and nodes-ngrams
if
len
(
nodes_ngrams_count
)
>=
BATCH_NGRAMSEXTRACTION_SIZE
:
if
len
(
nodes_ngrams_count
)
>=
BATCH_NGRAMSEXTRACTION_SIZE
:
print
(
len
(
nodes_ngrams_count
),
">="
,
BATCH_NGRAMSEXTRACTION_SIZE
)
#
print(len(nodes_ngrams_count),">=", BATCH_NGRAMSEXTRACTION_SIZE)
_integrate_associations
(
nodes_ngrams_count
,
ngrams_data
,
db
,
cursor
)
_integrate_associations
(
nodes_ngrams_count
,
ngrams_data
,
db
,
cursor
)
nodes_ngrams_count
.
clear
()
nodes_ngrams_count
.
clear
()
ngrams_data
.
clear
()
ngrams_data
.
clear
()
...
...
gargantext/util/toolchain/parsing.py
View file @
e8d5e001
...
@@ -68,7 +68,8 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
...
@@ -68,7 +68,8 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
lang_result
[
'skipped'
]
.
append
(
hyperdata
[
"language_name"
])
lang_result
[
'skipped'
]
.
append
(
hyperdata
[
"language_name"
])
else
:
else
:
print
(
"[WARNING] no language_* found in document [parsing.py]"
)
print
(
"WARNING no language_* found in document [parsing.py] => "
+
(
"(detecting)"
if
DETECT_LANG
else
"(using default)"
))
if
DETECT_LANG
:
if
DETECT_LANG
:
#no language have been indexed
#no language have been indexed
...
@@ -93,7 +94,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
...
@@ -93,7 +94,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
def
parse
(
corpus
):
def
parse
(
corpus
):
try
:
try
:
print
(
"PARSING"
)
print
(
"PARSING"
)
print
(
"DETECT_LANG?"
,
DETECT_LANG
)
#
print("DETECT_LANG?", DETECT_LANG)
corpus
.
status
(
'Docs'
,
progress
=
0
)
corpus
.
status
(
'Docs'
,
progress
=
0
)
#1 corpus => 1 or multi resources.path (for crawlers)
#1 corpus => 1 or multi resources.path (for crawlers)
resources
=
corpus
.
resources
()
resources
=
corpus
.
resources
()
...
@@ -107,7 +108,9 @@ def parse(corpus):
...
@@ -107,7 +108,9 @@ def parse(corpus):
#corpus.status(error)
#corpus.status(error)
raise
ValueError
(
"Resource '
%
s' has no Parser"
%
resource
[
"name"
])
raise
ValueError
(
"Resource '
%
s' has no Parser"
%
resource
[
"name"
])
parserbot
=
load_parser
(
source
)
parserbot
=
load_parser
(
source
)
print
(
parserbot
)
# print(parserbot)
#observed languages in default languages
#observed languages in default languages
observed_languages
=
[]
observed_languages
=
[]
#skipped_languages
#skipped_languages
...
@@ -218,10 +221,10 @@ def parse(corpus):
...
@@ -218,10 +221,10 @@ def parse(corpus):
#les jolis iso2
#les jolis iso2
observed_langs
=
dict
(
Counter
(
observed_languages
))
observed_langs
=
dict
(
Counter
(
observed_languages
))
print
(
"#LANGAGES OK"
)
#
print("#LANGAGES OK")
print
(
observed_langs
)
#
print(observed_langs)
print
(
"#LANGUAGES UNKNOWN"
)
#
print("#LANGUAGES UNKNOWN")
print
(
skipped_langs
)
#
print(skipped_langs)
top_langs
=
sorted
(
observed_langs
.
items
(),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
top_langs
=
sorted
(
observed_langs
.
items
(),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
if
len
(
top_langs
)
>
0
:
if
len
(
top_langs
)
>
0
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment