humanities / gargantext / Commits / f93d4266

Commit f93d4266, authored Aug 26, 2016 by c24b
Parent: 7c61a9fa

    LANG => tagger + stemmer

Showing 5 changed files with 39 additions and 57 deletions:

    gargantext/constants.py                         +6  -27
    gargantext/util/taggers/NltkTagger.py           +12 -12
    gargantext/util/toolchain/ngram_groups.py       +5  -7
    gargantext/util/toolchain/ngrams_extraction.py  +1  -1
    gargantext/util/toolchain/parsing.py            +15 -10

gargantext/constants.py
@@ -131,8 +131,7 @@ def get_resource_by_name(sourcename):
 # taggers -----------------------------------------------
 def get_tagger(lang):
     '''
-    lang => default langage[0] => Tagger
+    lang => observed language[0] => Tagger
     '''
     name = LANGUAGES[lang]["tagger"]
     module = "gargantext.util.taggers.%s" %(name)
@@ -150,7 +149,6 @@ RESOURCETYPES = [
         'parser': "EuropresseParser",
         'file_formats':["zip","txt"],
         'crawler': None,
-        'default_languages': ['en','fr'],
     },
     {   'type': 2,
         'name': 'Jstor [RIS]',
@@ -158,7 +156,6 @@ RESOURCETYPES = [
         'parser': "RISParser",
         'file_formats':["zip","txt"],
         'crawler': None,
-        'default_languages': ['en'],
     },
     {   'type': 3,
         'name': 'Pubmed [XML]',
@@ -166,7 +163,6 @@ RESOURCETYPES = [
         'parser': "PubmedParser",
         'file_formats':["zip","xml"],
         'crawler': "PubmedCrawler",
-        'default_languages': ['en'],
     },
     {   'type': 4,
         'name': 'Scopus [RIS]',
@@ -174,7 +170,6 @@ RESOURCETYPES = [
         'parser': "RISParser",
         'file_formats':["zip","txt"],
         'crawler': None,
-        'default_languages': ['en'],
     },
     {   'type': 5,
         'name': 'Web of Science [ISI]',
@@ -183,7 +178,6 @@ RESOURCETYPES = [
         'file_formats':["zip","txt"],
         #'crawler': "ISICrawler",
         'crawler': None,
-        'default_languages': ['en'],
     },
     {   'type': 6,
         'name': 'Zotero [RIS]',
@@ -191,7 +185,6 @@ RESOURCETYPES = [
         'parser': 'RISParser',
         'file_formats':["zip","ris","txt"],
         'crawler': None,
-        'default_languages': ['en'],
     },
     {   'type': 7,
         'name': 'CSV',
@@ -199,7 +192,6 @@ RESOURCETYPES = [
         'parser': 'CSVParser',
         'file_formats':["zip","csv"],
         'crawler': None,
-        'default_languages': ['en'],
     },
     {   'type': 8,
         'name': 'ISTex',
@@ -207,7 +199,6 @@ RESOURCETYPES = [
         'parser': "ISTexParser",
         'file_formats':["zip","txt"],
         'crawler': None,
-        'default_languages': ['en','fr'],
     },
     {   "type": 9,
         "name": 'SCOAP [XML]',
@@ -215,7 +206,6 @@ RESOURCETYPES = [
         "format": 'MARC21',
         'file_formats':["zip","xml"],
         "crawler": "CernCrawler",
-        'default_languages': ['en'],
     },
     {   "type": 10,
         "name": 'REPEC [RIS]',
@@ -223,7 +213,6 @@ RESOURCETYPES = [
         "format": 'RIS',
         'file_formats':["zip","ris","txt"],
         "crawler": None,
-        'default_languages': ['en'],
     },
 ]
 #shortcut for resources declaration in template
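Note on the block of hunks above: every RESOURCETYPES entry loses its hard-coded 'default_languages' list. Together with the docstring change in get_tagger ("default langage" becoming "observed language"), this suggests languages are now taken from what is actually observed in the corpus at parsing time (see parsing.py below) rather than declared per source. A toy illustration of the entry shape that remains, with a minimal, hypothetical body for get_resource_by_name (its real implementation is outside these hunks):

    # Hypothetical, trimmed-down RESOURCETYPES entry (post-commit shape:
    # no 'default_languages' key) and a minimal name lookup.
    RESOURCETYPES = [
        {'type': 3,
         'name': 'Pubmed [XML]',
         'parser': "PubmedParser",
         'file_formats': ["zip", "xml"],
         'crawler': "PubmedCrawler"},
    ]

    def get_resource_by_name(sourcename):
        for resource in RESOURCETYPES:
            if resource['name'] == sourcename:
                return resource

    print(get_resource_by_name('Pubmed [XML]')['parser'])   # PubmedParser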
@@ -278,13 +267,11 @@ def load_tagger(lang):
     given a LANG load the corresponding tagger
     lang(str) > Tagger(Object)
     '''
-    try:
-        filename = LANGUAGES[lang]["tagger"]
-        module = 'gargantext.util.taggers.%s' %(filename)
-        module = importlib.import_module(module)
-        return getattr(module, filename)()
-    except:
-        raise ImportError("No tagger for this lang %s TIP: declare a new parser in LANGUAGES" %lang)
+    filename = LANGUAGES[lang]["tagger"]
+    module = 'gargantext.util.taggers.%s' %(filename)
+    module = importlib.import_module(module)
+    return getattr(module, filename)()

 # linguistic extraction parameters ---------------------------------------------
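load_tagger drops its blanket try/except, which converted every failure (including a plain missing-language KeyError) into the same generic ImportError; errors now surface as they are. The mechanism itself relies on a naming convention visible in the code: the tagger class in gargantext/util/taggers/<Name>.py is named exactly like its file, so getattr(module, filename) finds it. A self-contained sketch of the same resolve-at-runtime pattern, using standard-library names so it runs anywhere:

    import importlib

    # Build a dotted module path, import it, then fetch an attribute by name.
    # In gargantext the module would be 'gargantext.util.taggers.NltkTagger'
    # and the attribute 'NltkTagger'; stdlib stand-ins are used here.
    module_name, attr_name = 'collections', 'OrderedDict'
    module = importlib.import_module(module_name)
    cls = getattr(module, attr_name)
    print(cls([('a', 1)]))   # OrderedDict([('a', 1)])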
@@ -361,14 +348,6 @@ QUERY_SIZE_N_MAX = 1000
 QUERY_SIZE_N_DEFAULT = 1000

 # Grammar rules for chunking
 RULE_JJNN = "{<JJ.*>*<NN.*|>+<JJ.*>*}"
-RULE_NPN  = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
-RULE_TINA = "^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)\+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?\,){0,2}?(N.?.?,|\?,)+?)+?)*?$"

 # ------------------------------------------------------------------------------
 # Graph constraints to compute the graph:
 # Modes: live graph generation, graph asynchronously computed or errors detected
gargantext/util/taggers/NltkTagger.py
@@ -8,7 +8,7 @@ class NltkTagger(Tagger):
     #import nltk
     def __init__(self, *args, **kwargs):
         self.tagr = PerceptronTagger()
-        super(self.__class__, self).__init__(*args, **kwargs)
+        #super(self.__class__, self).__init__(*args, **kwargs)

     #def __start__(self):
         #~ self.tagr = PerceptronTagger()
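For context, PerceptronTagger is NLTK's averaged-perceptron POS tagger and NltkTagger is a thin wrapper around it. A minimal sketch of what the wrapped object does, assuming the pretrained model is installed via nltk.download('averaged_perceptron_tagger'):

    from nltk.tag.perceptron import PerceptronTagger

    tagr = PerceptronTagger()                  # loads the pretrained English model
    print(tagr.tag(['wild', 'pollinators']))   # e.g. [('wild', 'JJ'), ('pollinators', 'NNS')]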
@@ -16,14 +16,14 @@ class NltkTagger(Tagger):
     def tag_tokens(self, tokens, single=True):
         return self.tagr.tag(tokens)

-    def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
-        self.text = self.clean_text(text)
-        grammar = nltk.RegexpParser(label + ': ' + rule)
-        tagged_tokens = list(self.tag_text(self.text))
-        if len(tagged_tokens):
-            grammar_parsed = grammar.parse(tagged_tokens)
-            for subtree in grammar_parsed.subtrees():
-                if subtree.label() == label:
-                    if len(subtree) < max_n_words:
-                        yield subtree.leaves()
-                        # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
+    #def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
+    #    self.text = self.clean_text(text)
+    #    grammar = nltk.RegexpParser(label + ': ' + rule)
+    #    tagged_tokens = list(self.tag_text(self.text))
+    #    if len(tagged_tokens):
+    #        grammar_parsed = grammar.parse(tagged_tokens)
+    #        for subtree in grammar_parsed.subtrees():
+    #            if subtree.label() == label:
+    #                if len(subtree) < max_n_words:
+    #                    yield subtree.leaves()
+    #                    # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
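The extract method (now commented out here, presumably in favour of the base Tagger implementation) shows the chunking recipe in full: POS-tag the tokens, then run nltk.RegexpParser over the tag sequence with RULE_JJNN so that adjective-noun groups come out as 'NP' subtrees. A standalone sketch of that recipe, assuming the NLTK models are installed (nltk.download('punkt') and nltk.download('averaged_perceptron_tagger')):

    import nltk

    rule = "{<JJ.*>*<NN.*|>+<JJ.*>*}"          # RULE_JJNN: optional adjectives around a noun core
    grammar = nltk.RegexpParser('NP: ' + rule)

    tokens = nltk.word_tokenize("wild pollinators visit the flowering crops.")
    tagged = nltk.pos_tag(tokens)              # e.g. [('wild', 'JJ'), ('pollinators', 'NNS'), ...]

    for subtree in grammar.parse(tagged).subtrees():
        if subtree.label() == 'NP':
            print(subtree.leaves())            # e.g. [('wild', 'JJ'), ('pollinators', 'NNS')]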
gargantext/util/toolchain/ngram_groups.py
@@ -26,8 +26,7 @@ def prepare_stemmers(corpus):
     and formatted
     """
     supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"] \
-                                    if lang != "__unknown__" \
-                                    if lang in LANGUAGES.keys()]
+                                    if lang != "__unknown__" ]
     stemmers = {lang: SnowballStemmer(languages[lang].name.lower()) for lang \
                     in supported_stemmers_lang}
     stemmers['__unknown__'] = SnowballStemmer("english")
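prepare_stemmers now builds a stemmer for every observed language except "__unknown__", no longer restricted to the languages that have a tagger in LANGUAGES. A condensed sketch of the resulting table, with the iso2-to-name mapping hard-coded for illustration (the real code derives it from the languages lookup, via languages[lang].name.lower()):

    from nltk.stem.snowball import SnowballStemmer

    names = {'en': 'english', 'fr': 'french'}    # stand-in for languages[lang].name.lower()
    stemmers = {lang: SnowballStemmer(name) for lang, name in names.items()}
    stemmers['__unknown__'] = SnowballStemmer("english")   # fallback, as in prepare_stemmers

    print(stemmers['en'].stem('pollinators'))    # e.g. pollin
    print(stemmers['fr'].stem('langages'))       # e.g. langag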
@@ -51,9 +50,8 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
     # and group if same stem/lemma
     stemmers = prepare_stemmers(corpus)
     print("# STEMMERS LOADED", stemmers)
-    supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"] \
-                                    if lang != "__unknown__" \
-                                    and lang in LANGUAGES.keys()]
+    supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"]
+                                    if lang != "__unknown__"]
     print("#SUPPORTED STEMMERS LANGS", supported_stemmers_lang)
     # todo dict {lg => {ngrams_todo} }
     todo_ngrams_per_lg = defaultdict(set)
@@ -64,8 +62,8 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
     # preloop per doc to sort ngrams by language
     for doc in corpus.children('DOCUMENT'):
         if doc.id not in corpus.hyperdata['skipped_docs']:
-            if ('language_iso2' in doc.hyperdata) \
-                and doc.hyperdata['language_iso2'] in supported_stemmers_lang:
+            if ('language_iso2' in doc.hyperdata) and doc.hyperdata['language_iso2'] \
+                in supported_stemmers_lang:
                 lgid = doc.hyperdata['language_iso2']
             else:
gargantext/util/toolchain/ngrams_extraction.py
@@ -52,7 +52,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
     #load available taggers for default langage of plateform
     #print(LANGUAGES.keys())
     tagger_bots = {lang: load_tagger(lang) for lang in corpus.hyperdata["languages"] \
-                   if lang != "__unknown__" and lang in LANGUAGES.keys()}
+                   if lang != "__unknown__"}
     print("#TAGGERS LOADED: ", tagger_bots)
     supported_taggers_lang = tagger_bots.keys()
     print("#SUPPORTED TAGGER LANGS", supported_taggers_lang)
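The same relaxation appears at the tagger-loading site: with the `lang in LANGUAGES.keys()` guard gone from the dict comprehension, every observed language except "__unknown__" reaches load_tagger, and an undeclared language now fails loudly there (a KeyError on LANGUAGES[lang], since the try/except was also removed) instead of being silently skipped. A sketch of that behaviour with illustrative stand-ins, not the real load_tagger:

    LANGUAGES = {'en': {'tagger': 'NltkTagger'}}          # hypothetical, trimmed table

    def load_tagger(lang):
        return LANGUAGES[lang]['tagger']                   # raises KeyError for e.g. 'de'

    observed = ['en', 'de', '__unknown__']
    try:
        tagger_bots = {lang: load_tagger(lang) for lang in observed
                       if lang != "__unknown__"}
    except KeyError as missing:
        print("no tagger declared for", missing)           # no tagger declared for 'de'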
gargantext/util/toolchain/parsing.py
@@ -14,8 +14,10 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
     '''
     if "language_iso2" in hyperdata.keys():
         observed_languages.append(hyperdata["language_iso2"])
-        observed_languages[hyperdata["language_iso2"]]
+        if hyperdata["language_iso2"] not in LANGUAGES.keys():
+            skipped_languages.append(hyperdata["language_iso2"])
+            return observed_languages, skipped_languages
         return observed_languages, skipped_languages
@@ -23,6 +25,9 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
         #convert
         try:
             lang = languages[hyperdata["language_iso3"]].iso2
+            if lang not in LANGUAGES.keys():
+                skipped_languages.append(lang)
+                return observed_languages, skipped_languages
             observed_languages.append(lang)
             return observed_languages, skipped_languages
         except KeyError:
@@ -35,6 +40,9 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
         try:
             #convert
             lang = hyperdata["language_fullname"].iso2
+            if lang not in LANGUAGES.keys():
+                skipped_languages.append(lang)
+                return observed_languages, skipped_languages
             observed_languages.append(lang)
             return observed_languages, skipped_languages
         except KeyError:
@@ -59,6 +67,9 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
         else:
             #detect_lang return iso2
             lang = detect_lang(text)
+            if lang not in LANGUAGES.keys():
+                skipped_languages.append(lang)
+                return observed_languages, skipped_languages
             observed_languages.append(lang)
             return observed_languages, skipped_languages
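Across these four hunks, add_lang gains the same post-detection check: whichever route produced the iso2 code (the language_iso2 field, iso3 conversion, the full language name, or detect_lang on the text), the language is recorded as observed, and additionally recorded as skipped when LANGUAGES has no entry for it. A condensed, standalone sketch of that flow (hypothetical simplification; the real function also handles the iso3, fullname and detection fallbacks):

    LANGUAGES = {'en': {}, 'fr': {}}      # supported languages, keyed by iso2

    def add_lang(hyperdata, observed_languages, skipped_languages):
        lang = hyperdata.get("language_iso2")
        if lang is not None:
            observed_languages.append(lang)
            if lang not in LANGUAGES:
                skipped_languages.append(lang)
        return observed_languages, skipped_languages

    obs, skip = add_lang({"language_iso2": "de"}, [], [])
    print(obs, skip)    # ['de'] ['de']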
@@ -80,6 +91,7 @@ def parse(corpus):
         #corpus.status(error)
         raise ValueError("Resource '%s' has no Parser" %resource["name"])
     parserbot = load_parser(source)
+    print(parserbot)
     #observed languages in default languages
     observed_languages = []
     #skipped_languages
@@ -167,14 +179,7 @@ def parse(corpus):
         corpus.save_hyperdata()
-        #TODO: assign main lang of the corpus to unsupported languages docs
-        # for d_id in corpus.skipped_docs:
-        #     document = session.query(Node).filter(Node.id == d_id, Node.typename == "DOCUMENT").first()
-        #     if document.hyperdata["error"].startswith("Error: unsupported language"):
-        #         print(document.hyperdata["language_iso2"])
-        #         document.hyperdata["language_iso2"] = corpus.language_id
-        #         document.save_hyperdata()
-        #         session.commit()
     except Exception as error:
         corpus.status('Docs', error=error)
         corpus.save_hyperdata()