Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
4211b1e2
Commit
4211b1e2
authored
Aug 26, 2016
by
c24b
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Revert taggers modifications
parent
aa70aaa8
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
35 additions
and
36 deletions
+35
-36
NltkTagger.py
gargantext/util/taggers/NltkTagger.py
+2
-14
TreeTagger.py
gargantext/util/taggers/TreeTagger.py
+0
-1
TurboTagger.py
gargantext/util/taggers/TurboTagger.py
+1
-2
_Tagger.py
gargantext/util/taggers/_Tagger.py
+2
-1
ngrams_extraction.py
gargantext/util/toolchain/ngrams_extraction.py
+4
-4
parsing.py
gargantext/util/toolchain/parsing.py
+26
-14
No files found.
gargantext/util/taggers/NltkTagger.py
View file @
4211b1e2
...
...
@@ -8,22 +8,10 @@ class NltkTagger(Tagger):
#import nltk
def
__init__
(
self
,
*
args
,
**
kwargs
):
self
.
tagr
=
PerceptronTagger
()
#
super(self.__class__, self).__init__(*args, **kwargs)
super
(
self
.
__class__
,
self
)
.
__init__
(
*
args
,
**
kwargs
)
#def __start__(self):
#
~
def __start__(self):
#~ self.tagr = PerceptronTagger()
def
tag_tokens
(
self
,
tokens
,
single
=
True
):
return
self
.
tagr
.
tag
(
tokens
)
# def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
# self.text = self.clean_text(text)
# grammar = nltk.RegexpParser(label + ': ' + rule)
# tagged_tokens = list(self.tag_text(self.text))
# if len(tagged_tokens):
# grammar_parsed = grammar.parse(tagged_tokens)
# for subtree in grammar_parsed.subtrees():
# if subtree.label() == label:
# if len(subtree) < max_n_words:
# yield subtree.leaves()
# # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
gargantext/util/taggers/TreeTagger.py
View file @
4211b1e2
...
...
@@ -74,7 +74,6 @@ class TreeTagger(Tagger):
self
.
_input
,
self
.
_output
=
self
.
_popen
.
stdin
,
self
.
_popen
.
stdout
# self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start()
# self.buffer = OutputBuffer()
# self.extract(self.text)
def
stop
(
self
):
# terminates the 'treetagger' process
...
...
gargantext/util/taggers/TurboTagger.py
View file @
4211b1e2
...
...
@@ -6,13 +6,12 @@ class TurboTagger:
def
start
(
self
):
self
.
_nlpclient
=
NLPClient
()
#self.extract(self.text)
def
stop
(
self
):
if
hasattr
(
self
,
'_nlpclient'
):
del
self
.
_nlpclient
def
extrac
t
(
self
,
text
):
def
tag_tex
t
(
self
,
text
):
if
not
hasattr
(
self
,
'_nlpclient'
):
self
.
_nlpclient
=
NLPClient
()
try
:
...
...
gargantext/util/taggers/_Tagger.py
View file @
4211b1e2
...
...
@@ -19,7 +19,8 @@ class Tagger:
| [][.,;"'?!():-_`] # these are separate tokens
'''
,
re
.
UNICODE
|
re
.
MULTILINE
|
re
.
DOTALL
)
self
.
buffer
=
[]
self
.
start
()
#self.start()
def
clean_text
(
self
,
text
):
...
...
gargantext/util/toolchain/ngrams_extraction.py
View file @
4211b1e2
...
...
@@ -9,7 +9,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
"""
@param ngrams_data a set like {('single word', 2), ('apple', 1),...}
"""
#
print('INTEGRATE')
print
(
'INTEGRATE'
)
# integrate ngrams
ngrams_ids
=
bulk_insert_ifnotexists
(
model
=
Ngram
,
...
...
@@ -59,7 +59,6 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
#sort docs by lang?
# for lang, tagger in tagger_bots.items():
for
documents_count
,
document
in
enumerate
(
corpus
.
children
(
'DOCUMENT'
)):
if
document
.
id
not
in
corpus
.
hyperdata
[
"skipped_docs"
]:
language_iso2
=
document
.
hyperdata
.
get
(
'language_iso2'
)
if
language_iso2
not
in
supported_taggers_lang
:
...
...
@@ -73,7 +72,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
else
:
tagger
=
tagger_bots
[
language_iso2
]
print
(
tagger
)
#print(language_iso2)
#>>> romain-stable-patch
#to do verify if document has no KEYS to index
...
...
@@ -109,10 +108,11 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
# integrate ngrams and nodes-ngrams
if
len
(
nodes_ngrams_count
)
>=
BATCH_NGRAMSEXTRACTION_SIZE
:
print
(
len
(
nodes_ngrams_count
),
">="
,
BATCH_NGRAMSEXTRACTION_SIZE
)
_integrate_associations
(
nodes_ngrams_count
,
ngrams_data
,
db
,
cursor
)
nodes_ngrams_count
.
clear
()
ngrams_data
.
clear
()
if
documents_count
%
BATCH_
NGRAMSEXTRACTION
_SIZE
==
0
:
if
documents_count
%
BATCH_
PARSING
_SIZE
==
0
:
corpus
.
status
(
'Ngrams'
,
progress
=
documents_count
+
1
)
corpus
.
save_hyperdata
()
session
.
add
(
corpus
)
...
...
gargantext/util/toolchain/parsing.py
View file @
4211b1e2
...
...
@@ -99,15 +99,14 @@ def parse(corpus):
#skipped docs to remember for later processing
skipped_docs
=
[]
documents_count
=
0
#BY RESOURCE
for
i
,
resource
in
enumerate
(
resources
):
if
resource
[
"extracted"
]
is
True
:
continue
else
:
# BY documents
d
=
0
for
documents_count
,
hyperdata
in
enumerate
(
parserbot
(
resource
[
"path"
])):
for
hyperdata
in
parserbot
(
resource
[
"path"
]):
# indexed text fields defined in CONSTANTS
for
k
in
DEFAULT_INDEX_FIELDS
:
if
k
in
hyperdata
.
keys
():
...
...
@@ -126,32 +125,45 @@ def parse(corpus):
name
=
hyperdata
.
get
(
'title'
,
''
)[:
255
],
hyperdata
=
hyperdata
,
)
#corpus.save_hyperdata()
# session.add(document)
# session.commit()
session
.
add
(
document
)
session
.
commit
()
if
"error"
in
hyperdata
.
keys
():
#document.status("error")
document
.
status
(
'Parsing'
,
error
=
document
.
hyperdata
[
"error"
])
document
.
save_hyperdata
()
#document.status('Parsing', error= document.hyperdata["error"])
#document.save_hyperdata()
#session.add(document)
#session.commit()
#adding skipped_docs for later processsing if error in parsing
skipped_docs
.
append
(
document
.
id
)
#BATCH_PARSING_SIZE
if
documents_count
%
BATCH_PARSING_SIZE
==
0
:
corpus
.
status
(
'Docs'
,
progress
=
documents_count
)
corpus
.
save_hyperdata
()
#
session.add(corpus)
#
session.commit()
session
.
add
(
corpus
)
session
.
commit
()
documents_count
+=
1
# update info about the resource
resource
[
'extracted'
]
=
True
#print( "resource n°",i, ":", d, "docs inside this file")
#finally store documents for this corpus
session
.
add
(
corpus
)
session
.
commit
()
#finally store documents for this corpus
corpus
.
status
(
'Parsing'
,
progress
=
documents_count
+
1
,
complete
=
True
)
#corpus.status('Parsing', complete =True)
corpus
.
save_hyperdata
()
#session.add(corpus)
#session.commit()
#adding parsing error to document level
for
node_id
in
skipped_docs
:
node
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
node_id
)
.
first
()
node
.
status
(
"Parsing"
,
"Error in parsing"
)
node
.
save_hyperdata
()
#session.flush()
#skipped_nodes = session.query(Node).filter(Node.id.in_(skipped_docs)).all()
#mods = [node.status('Parsing', "Error in parsing:skipped") for node in skipped_nodes]
#STORING AGREGATIONS INFO (STATS)
#skipped_docs
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment