Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
0f26d8a2
Commit
0f26d8a2
authored
Jul 27, 2016
by
c24b
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Adding a call to Tagger directly getting rid of ngramsextractors wrapper
parent
403913fc
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
70 additions
and
3 deletions
+70
-3
ngramsextractors.py
gargantext/util/ngramsextractors.py
+45
-0
_Tagger.py
gargantext/util/taggers/_Tagger.py
+25
-3
No files found.
gargantext/util/ngramsextractors.py
0 → 100644
View file @
0f26d8a2
from
gargantext.util.languages
import
languages
from
gargantext.constants
import
LANGUAGES
,
DEFAULT_MAX_NGRAM_LEN
,
RULE_JJNN
,
RULE_NPN
import
nltk
import
re
class NgramsExtractor:
    """Extract ngrams from raw text by chunking POS-tagged tokens.

    The extractor owns one tagger object, built once in the constructor,
    and uses an NLTK ``RegexpParser`` to group the tagged tokens into
    chunks matching a grammar rule.
    """

    def __init__(self, tagger):
        """Instantiate and keep the tagger.

        ``tagger`` is called with no arguments; the object it returns must
        provide a ``tag_text(text)`` method (used by ``extract``).
        """
        tagger_instance = tagger()
        self._tagger = tagger_instance

    @staticmethod
    def clean_text(text):
        """Clean the text for better POS tagging.

        For now this only removes short XML tags: a ``<``, at most 45
        characters other than ``>``, then a closing ``>``.
        """
        short_tag = r'<[^>]{0,45}>'
        return re.sub(short_tag, '', text)

    def extract(self, text, rule=RULE_JJNN, label='NP',
                max_n_words=DEFAULT_MAX_NGRAM_LEN):
        """Yield the leaves of each chunk of ``text`` that matches ``rule``.

        ``text`` is cleaned with ``clean_text`` and tagged by the tagger.
        The tagged tokens are then parsed with the grammar
        ``"<label>: <rule>"``. Only subtrees carrying ``label`` and holding
        strictly fewer than ``max_n_words`` children are yielded. Nothing
        is yielded when the tagger produces no tokens.

        This is a generator: no work is done until it is iterated.
        """
        cleaned = self.clean_text(text)
        chunker = nltk.RegexpParser(label + ': ' + rule)
        tokens = list(self._tagger.tag_text(cleaned))
        if not tokens:
            return
        tree = chunker.parse(tokens)
        for chunk in tree.subtrees():
            # Skip subtrees of another label, and chunks that are too long.
            if chunk.label() != label or len(chunk) >= max_n_words:
                continue
            # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
            yield chunk.leaves()
class NgramsExtractors(dict):
    """Dictionary of ``NgramsExtractor`` objects, filled lazily on lookup.

    Looking up a key that is not stored yet goes through ``__missing__``,
    which builds the entry, stores it under that key and returns it, so
    later lookups of the same key hit the stored value directly.
    """

    def __missing__(self, key):
        """Build, store and return the extractor for a missing ``key``.

        A two-character, all-lowercase string key gets a new
        ``NgramsExtractor`` built from ``LANGUAGES[key]['tagger']``.
        Any other string key is resolved through the fallback branch below.

        Raises:
            KeyError: if ``key`` is not a string (the exception carries
                the offending key), or if a lookup in ``LANGUAGES`` fails.
        """
        if not isinstance(key, str):
            # Include the key so the failure can be diagnosed by the caller.
            raise KeyError(key)
        if len(key) == 2 and key == key.lower():
            tagger = LANGUAGES[key]['tagger']
            self[key] = NgramsExtractor(tagger)
        else:
            # NOTE(review): the branch above subscripts LANGUAGES[key] with
            # 'tagger', while this one reads an `.iso3` attribute from it;
            # these two uses look inconsistent. Possibly the `languages`
            # object and a two-letter code were intended here -- confirm
            # against gargantext.util.languages before changing.
            self[key] = self[LANGUAGES[key].iso3]
        return self[key]
# Module-level NgramsExtractors instance, created when this module is imported.
# Original note: this will be shared within the current thread.
ngramsextractors = NgramsExtractors()
gargantext/util/taggers/_Tagger.py
View file @
0f26d8a2
...
...
@@ -3,13 +3,13 @@ When started, it initiates the parser;
when passed text, the text is piped to the parser.
When ended, the parser is closed and the tagged word returned as a tuple.
"""
from
constants
import
RULE_JJNN
,
DEFAULT_MAX_NGRAM_LEN
import
re
import
nltk
class
Tagger
:
def
__init__
(
self
):
def
__init__
(
self
,
text
):
# This regular expression is really good at tokenizing a text!
self
.
_re_sentence
=
re
.
compile
(
r'''(?x) # set flag to allow verbose regexps
(?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
...
...
@@ -19,8 +19,29 @@ class Tagger:
| [][.,;"'?!():-_`] # these are separate tokens
'''
,
re
.
UNICODE
|
re
.
MULTILINE
|
re
.
DOTALL
)
self
.
buffer
=
[]
self
.
text
=
clean_text
(
text
)
self
.
start
()
def
clean_text
(
text
):
"""Clean the text for better POS tagging.
For now, only removes (short) XML tags.
"""
return
re
.
sub
(
r'<[^>]{0,45}>'
,
''
,
text
)
def
extract
(
self
,
text
,
rule
=
RULE_JJNN
,
label
=
'NP'
,
max_n_words
=
DEFAULT_MAX_NGRAM_LEN
):
text
=
self
.
clean_text
(
text
)
grammar
=
nltk
.
RegexpParser
(
label
+
': '
+
rule
)
tagged_tokens
=
list
(
self
.
tag_text
(
self
.
text
))
if
len
(
tagged_tokens
):
grammar_parsed
=
grammar
.
parse
(
tagged_tokens
)
for
subtree
in
grammar_parsed
.
subtrees
():
if
subtree
.
label
()
==
label
:
if
len
(
subtree
)
<
max_n_words
:
yield
subtree
.
leaves
()
# ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
def
__del__
(
self
):
self
.
stop
()
...
...
@@ -29,6 +50,7 @@ class Tagger:
This method is called by the constructor, and can be overridden by
inherited classes.
"""
self
.
extract
(
self
.
text
)
def
stop
(
self
):
"""Ends the tagger.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment