humanities / gargantext · Commits

Commit bd714567
Authored Aug 26, 2016 by c24b
Parent: e34b48b5

Revert taggers modifications

Showing 6 changed files with 35 additions and 36 deletions (+35 / -36).
gargantext/util/taggers/NltkTagger.py              +2   -14
gargantext/util/taggers/TreeTagger.py              +0   -1
gargantext/util/taggers/TurboTagger.py             +1   -2
gargantext/util/taggers/_Tagger.py                 +2   -1
gargantext/util/toolchain/ngrams_extraction.py     +4   -4
gargantext/util/toolchain/parsing.py               +26  -14
gargantext/util/taggers/NltkTagger.py

@@ -8,22 +8,10 @@ class NltkTagger(Tagger):
     #import nltk

     def __init__(self, *args, **kwargs):
         self.tagr = PerceptronTagger()
-        #super(self.__class__, self).__init__(*args, **kwargs)
+        super(self.__class__, self).__init__(*args, **kwargs)

-    #def __start__(self):
+    #~ def __start__(self):
     #~ self.tagr = PerceptronTagger()

     def tag_tokens(self, tokens, single=True):
         return self.tagr.tag(tokens)

-    # def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
-    #     self.text = self.clean_text(text)
-    #     grammar = nltk.RegexpParser(label + ': ' + rule)
-    #     tagged_tokens = list(self.tag_text(self.text))
-    #     if len(tagged_tokens):
-    #         grammar_parsed = grammar.parse(tagged_tokens)
-    #         for subtree in grammar_parsed.subtrees():
-    #             if subtree.label() == label:
-    #                 if len(subtree) < max_n_words:
-    #                     yield subtree.leaves()
-    #                     # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
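For context on what tag_tokens returns here: NLTK's PerceptronTagger tags an already-tokenized list of words with part-of-speech labels. A minimal standalone sketch, assuming NLTK and its averaged_perceptron_tagger model data are installed (not part of this commit):

    # Standalone sketch of the PerceptronTagger call behind NltkTagger.tag_tokens.
    # Assumes: pip install nltk, then nltk.download('averaged_perceptron_tagger').
    from nltk.tag.perceptron import PerceptronTagger

    tagr = PerceptronTagger()            # loads the pretrained model by default
    tokens = ['wild', 'pollinators', 'visit', 'apple', 'flowers']
    print(tagr.tag(tokens))
    # e.g. [('wild', 'JJ'), ('pollinators', 'NNS'), ('visit', 'VBP'), ...]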
gargantext/util/taggers/TreeTagger.py

@@ -74,7 +74,6 @@ class TreeTagger(Tagger):
         self._input, self._output = self._popen.stdin, self._popen.stdout
         # self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start()
         # self.buffer = OutputBuffer()
-        # self.extract(self.text)

     def stop(self):
         # terminates the 'treetagger' process
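The lines kept here wire the tagger to a long-running child process over stdin/stdout. A generic sketch of that pattern, using tr as a stand-in since the real 'treetagger' binary is not assumed to be installed:

    # Generic sketch of what TreeTagger.start() relies on: keep a child process
    # open and talk to it over stdin/stdout. 'tr' stands in for the treetagger binary.
    import subprocess

    popen = subprocess.Popen(['tr', 'a-z', 'A-Z'],
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE)
    _input, _output = popen.stdin, popen.stdout

    _input.write(b'wild pollinators\n')
    _input.close()              # like stop(): let the child flush and exit
    print(_output.read())       # b'WILD POLLINATORS\n'
    popen.wait()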
gargantext/util/taggers/TurboTagger.py

@@ -6,13 +6,12 @@ class TurboTagger:
     def start(self):
         self._nlpclient = NLPClient()
-        #self.extract(self.text)

     def stop(self):
         if hasattr(self, '_nlpclient'):
             del self._nlpclient

-    def extract(self, text):
+    def tag_text(self, text):
         if not hasattr(self, '_nlpclient'):
             self._nlpclient = NLPClient()
         try:
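The renamed tag_text keeps the hasattr guard, so the NLPClient is still created lazily on first use even if start() was never called. A self-contained sketch of that lazy start/stop pattern, with a placeholder client since NLPClient's real API is not shown in this commit:

    # Sketch of the lazy start/stop pattern kept in TurboTagger.
    # DummyClient is a placeholder, not the real NLPClient.
    class DummyClient:
        def tag(self, text):
            return [(w, 'NN') for w in text.split()]

    class LazyTagger:
        def start(self):
            self._client = DummyClient()

        def stop(self):
            if hasattr(self, '_client'):
                del self._client

        def tag_text(self, text):
            if not hasattr(self, '_client'):   # lazy init, mirrors the hasattr guard
                self._client = DummyClient()
            return self._client.tag(text)

    print(LazyTagger().tag_text('wild pollinators'))  # works without calling start()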
gargantext/util/taggers/_Tagger.py

@@ -19,7 +19,8 @@ class Tagger:
             | [][.,;"'?!():-_`] # these are separate tokens
             ''', re.UNICODE | re.MULTILINE | re.DOTALL)
         self.buffer = []
-        self.start()
+        #self.start()

     def clean_text(self, text):
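The context lines belong to a verbose tokenizing regex compiled with re.UNICODE | re.MULTILINE | re.DOTALL, in which the listed punctuation characters become separate tokens. An illustration of that kind of pattern; the word rule below is an assumption, only the punctuation class is taken from the diff context:

    # Illustration of a verbose tokenizing regex with the same flags as _Tagger's.
    import re

    re_token = re.compile(r'''(?x)      # verbose mode
          \w+(?:-\w+)*                  # words, possibly hyphenated (assumed rule)
        | [][.,;"'?!():-_`]             # these are separate tokens
        ''', re.UNICODE | re.MULTILINE | re.DOTALL)

    print(re_token.findall('Wild pollinators (e.g. bees) visit apple-flowers.'))
    # ['Wild', 'pollinators', '(', 'e', '.', 'g', '.', 'bees', ')', 'visit', 'apple-flowers', '.']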
gargantext/util/toolchain/ngrams_extraction.py

@@ -9,7 +9,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
     """
     @param ngrams_data a set like {('single word', 2), ('apple', 1),...}
     """
-    #print('INTEGRATE')
+    print('INTEGRATE')
     # integrate ngrams
     ngrams_ids = bulk_insert_ifnotexists(
         model = Ngram,

@@ -59,7 +59,6 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
     #sort docs by lang?
     # for lang, tagger in tagger_bots.items():
     for documents_count, document in enumerate(corpus.children('DOCUMENT')):
         if document.id not in corpus.hyperdata["skipped_docs"]:
             language_iso2 = document.hyperdata.get('language_iso2')
             if language_iso2 not in supported_taggers_lang:

@@ -73,7 +72,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
             else:
                 tagger = tagger_bots[language_iso2]
+                print(tagger)
                 #print(language_iso2)
                 #>>> romain-stable-patch
                 #to do verify if document has no KEYS to index

@@ -109,10 +108,11 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
             # integrate ngrams and nodes-ngrams
             if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
+                print(len(nodes_ngrams_count), ">=", BATCH_NGRAMSEXTRACTION_SIZE)
                 _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
                 nodes_ngrams_count.clear()
                 ngrams_data.clear()
-            if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
+            if documents_count % BATCH_PARSING_SIZE == 0:
                 corpus.status('Ngrams', progress=documents_count+1)
                 corpus.save_hyperdata()
                 session.add(corpus)
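The last hunk shows the batch-flush pattern: ngram counts accumulate per document and are written out via _integrate_associations once the buffer reaches BATCH_NGRAMSEXTRACTION_SIZE, with a periodic progress update. A self-contained sketch of that accumulation/flush logic; integrate() and the sizes are stand-ins, not the project's real helpers:

    # Sketch of the batch-flush pattern in extract_ngrams.
    from collections import Counter

    BATCH_SIZE = 4

    def integrate(counts):
        print('flushing', dict(counts))   # stand-in for _integrate_associations(...)

    def extract(documents):
        counts = Counter()
        for documents_count, doc in enumerate(documents):
            for word in doc.split():
                counts[word] += 1
            if len(counts) >= BATCH_SIZE:      # flush a full batch
                integrate(counts)
                counts.clear()
            if documents_count % 2 == 0:       # periodic progress report
                print('progress:', documents_count + 1)
        if counts:                             # flush the final partial batch
            integrate(counts)

    extract(['wild pollinators', 'apple flowers', 'wild apple'])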
gargantext/util/toolchain/parsing.py

@@ -99,15 +99,14 @@ def parse(corpus):
     #skipped docs to remember for later processing
     skipped_docs = []
-    documents_count = 0
     #BY RESOURCE
     for i, resource in enumerate(resources):
         if resource["extracted"] is True:
             continue
         else:
             # BY documents
-            d = 0
-            for hyperdata in parserbot(resource["path"]):
+            for documents_count, hyperdata in enumerate(parserbot(resource["path"])):
                 # indexed text fields defined in CONSTANTS
                 for k in DEFAULT_INDEX_FIELDS:
                     if k in hyperdata.keys():

@@ -126,32 +125,45 @@ def parse(corpus):
                     name = hyperdata.get('title', '')[:255],
                     hyperdata = hyperdata,
                 )
+                #corpus.save_hyperdata()
-                session.add(document)
+                # session.add(document)
-                session.commit()
+                # session.commit()
                 if "error" in hyperdata.keys():
                     #document.status("error")
-                    document.status('Parsing', error=document.hyperdata["error"])
+                    #document.status('Parsing', error= document.hyperdata["error"])
-                    document.save_hyperdata()
+                    #document.save_hyperdata()
+                    #session.add(document)
+                    #session.commit()
                     #adding skipped_docs for later processsing if error in parsing
                     skipped_docs.append(document.id)
+                #BATCH_PARSING_SIZE
                 if documents_count % BATCH_PARSING_SIZE == 0:
                     corpus.status('Docs', progress=documents_count)
                     corpus.save_hyperdata()
-                    #session.add(corpus)
+                    session.add(corpus)
-                    #session.commit()
+                    session.commit()
-                documents_count += 1
             # update info about the resource
             resource['extracted'] = True
             #print( "resource n°",i, ":", d, "docs inside this file")
     #finally store documents for this corpus
+    session.add(corpus)
+    corpus.status('Parsing', progress=documents_count+1, complete=True)
+    session.commit()
+    #corpus.status('Parsing', complete =True)
+    corpus.save_hyperdata()
+    #session.add(corpus)
+    #session.commit()
+    #adding parsing error to document level
+    for node_id in skipped_docs:
+        node = session.query(Node).filter(Node.id == node_id).first()
+        node.status("Parsing", "Error in parsing")
+        node.save_hyperdata()
+    #session.flush()
+    #skipped_nodes = session.query(Node).filter(Node.id.in_(skipped_docs)).all()
+    #mods = [node.status('Parsing', "Error in parsing:skipped") for node in skipped_nodes]
     #STORING AGREGATIONS INFO (STATS)
     #skipped_docs
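The net effect of this hunk is to move persistence from one add()/commit() per document back to one per BATCH_PARSING_SIZE documents, with a final commit (plus an error pass over skipped_docs) at the end. A schematic of that batched-commit idea, not the exact code; FakeSession is a stand-in for the SQLAlchemy session:

    # Sketch of committing every BATCH_PARSING_SIZE documents instead of every document.
    BATCH_PARSING_SIZE = 100

    class FakeSession:
        def add(self, obj): pass               # cheap: just stages the object
        def commit(self): print('commit')      # expensive: one round-trip per batch

    def parse(documents, session):
        for documents_count, document in enumerate(documents):
            session.add(document)
            if documents_count % BATCH_PARSING_SIZE == 0:
                session.commit()
        session.commit()                        # flush the last partial batch

    parse(range(250), FakeSession())   # 3 periodic commits + 1 final commit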