Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
6740e4df
Commit
6740e4df
authored
Jul 27, 2016
by
c24b
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
IMPORT constants crawlers+ taggers
parent
9c5c9d7e
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
198 additions
and
98 deletions
+198
-98
constants.py
gargantext/constants.py
+147
-94
__init__.py
gargantext/util/crawlers/__init__.py
+25
-0
__init__.py
gargantext/util/taggers/__init__.py
+26
-4
No files found.
gargantext/constants.py
View file @
6740e4df
# WARNING: to ensure consistency and retrocompatibility, lists should keep the
# initial order (ie., new elements should be appended at the end of the lists)
import
importlib
from
gargantext.util.lists
import
*
from
gargantext.util.tools
import
datetime
,
convert_to_date
import
re
# types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
LISTTYPES
=
{
'DOCUMENT'
:
WeightedList
,
'GROUPLIST'
:
Translations
,
# todo remove "LIST" from name
...
...
@@ -19,16 +20,16 @@ LISTTYPES = {
'TFIDF-CORPUS'
:
WeightedIndex
,
'TFIDF-GLOBAL'
:
WeightedIndex
,
'TIRANK-LOCAL'
:
WeightedIndex
,
# could be WeightedList
'TIRANK-GLOBAL'
:
WeightedIndex
,
# could be WeightedList
}
# 'OWNLIST' : UnweightedList, # £TODO use this for any term-level tags
NODETYPES
=
[
# TODO separate id not array index, read by models.node
None
,
None
,
# 0
# documents hierarchy
'USER'
,
# 1
'PROJECT'
,
# 2
#RESOURCE should be here but last
'CORPUS'
,
# 3
'DOCUMENT'
,
# 4
# lists
...
...
@@ -45,11 +46,12 @@ NODETYPES = [
'TFIDF-GLOBAL'
,
# 14
# docs subset
'FAVORITES'
,
# 15
# more scores (sorry!)
'TIRANK-LOCAL'
,
# 16
'TIRANK-GLOBAL'
,
# 17
'GENCLUSION'
,
# 18
'RESOURCE'
,
# 19
]
INDEXED_HYPERDATA
=
{
...
...
@@ -114,34 +116,17 @@ INDEXED_HYPERDATA = {
}
#from gargantext.util.taggers import FrenchMeltTagger, TurboTagger
from
gargantext.util.taggers
import
NltkTagger
LANGUAGES
=
{
'en'
:
{
#'tagger': EnglishMeltTagger,
#'tagger': TurboTagger,
'tagger'
:
NltkTagger
,
},
'fr'
:
{
#'tagger': FrenchMeltTagger,
#'tagger': TreeTagger,
'tagger'
:
NltkTagger
,
},
}
from
gargantext.util.parsers
import
\
EuropressParser
,
RISParser
,
PubmedParser
,
ISIParser
,
CSVParser
,
ISTexParser
,
CernParser
,
RepecParser
# resources ---------------------------------------------
def get_resource(sourcetype):
    """resource :: type => resource dict

    Return the RESOURCETYPES entry whose "type" field equals *sourcetype*
    (both sides coerced to int), or None when no entry matches.
    """
    wanted = int(sourcetype)
    return next(
        (entry for entry in RESOURCETYPES if int(entry["type"]) == wanted),
        None,
    )
def resourcetype(name):
    '''
    resourcetype :: String -> Int
    Usage : resourcetype("Europress") == 1
    Examples in scrapers scripts (Pubmed or ISTex for instance).

    Returns the declared "type" id of the resource whose "name" equals
    *name*.  Raises IndexError when no resource has that name.
    '''
    # The extracted block contained two implementations, the second
    # unreachable; keep the one that returns the declared "type" field,
    # consistent with get_resource() and the RESOURCETYPES schema.
    return [n["type"] for n in RESOURCETYPES if n["name"] == name][0]
def resourcename(corpus):
    '''
    resourcename :: Corpus -> String
    Usage : resourcename(corpus) == "ISTex"

    Return the declared name of the resource type of the corpus's first
    resource.  Assumes corpus.resources() is non-empty — TODO confirm
    with callers.
    '''
    # Dead code removed: an older index-based lookup
    # (RESOURCETYPES[resource['type']]['name'] + re.sub) followed the first
    # return and could never run; keep the get_resource()-based lookup,
    # which matches the explicit "type" keys in RESOURCETYPES.
    resource = corpus.resources()[0]
    rtype = resource["type"]
    return get_resource(rtype)["name"]
def get_tagger(lang):
    '''
    lang => default language => tagger instance

    Looks up the tagger class name declared (as a string) in
    LANGUAGES[lang]["tagger"], imports the module of the same basename
    from gargantext.util.taggers, and returns a new instance of the class.

    Raises KeyError for an unknown language, ImportError/AttributeError
    when the tagger module or class is missing.
    '''
    name = LANGUAGES[lang]["tagger"]
    # One module per tagger class, same basename (convention used by
    # gargantext.util.taggers).
    tagger_module = importlib.import_module(
        "gargantext.util.taggers.%s" % name, ""
    )
    tagger_class = getattr(tagger_module, name)
    # Debug print removed; return a fresh instance as before.
    return tagger_class()
# Resource type declarations.
# Each entry declares: an explicit integer "type" id (ids start at 1 and
# must stay stable for retrocompatibility), a display "name", the source
# "format", the parser class name (string, resolved lazily), accepted
# upload "file_formats", an optional crawler class name, and the default
# languages of the documents.
# NOTE(review): the extracted text interleaved the pre-commit entries
# (old schema, no "type" key) with these; only the post-commit entries
# are kept.
RESOURCETYPES = [
    {"type": 1,
     'name': 'Europress',
     'format': 'Europress',
     'parser': "EuropressParser",
     'file_formats': ["zip"],
     'crawler': None,
     'default_languages': ['en', 'fr'],
     },
    {'type': 2,
     'name': 'Jstor',
     'format': 'RIS',
     'parser': "RISParser",
     'file_formats': ["zip"],
     'crawler': None,
     'default_languages': ['en'],
     },
    {'type': 3,
     'name': 'Pubmed',
     'format': 'Pubmed',
     'parser': "PubmedParser",
     'file_formats': ["zip", "xml"],
     'crawler': "PubmedCrawler",
     'default_languages': ['en'],
     },
    {'type': 4,
     'name': 'Scopus',
     'format': 'RIS',
     'parser': "RISParser",
     'file_formats': ["zip"],
     'crawler': None,
     'default_languages': ['en'],
     },
    {'type': 5,
     'name': 'Web of Science',
     'format': 'ISI',
     'parser': "ISIParser",
     'file_formats': ["zip"],
     #'crawler': "ISICrawler",
     'crawler': None,
     'default_languages': ['en'],
     },
    {'type': 6,
     'name': 'Zotero',
     'format': 'RIS',
     'parser': 'RISParser',
     'file_formats': ["zip"],
     'crawler': None,
     'default_languages': ['en'],
     },
    {'type': 7,
     'name': 'CSV',
     'format': 'CSV',
     'parser': 'CSVParser',
     'file_formats': ["zip", "csv"],
     'crawler': None,
     'default_languages': ['en'],
     },
    {'type': 8,
     'name': 'ISTex',
     'format': 'ISI',
     'parser': "ISTexParser",
     'file_formats': ["zip"],
     #'crawler': "ISICrawler",
     'crawler': None,
     'default_languages': ['en', 'fr'],
     },
    {"type": 9,
     "name": 'SCOAP (XML MARC21 Format)',
     "parser": "CernParser",
     "format": 'MARC21',
     'file_formats': ["zip", "xml"],
     "crawler": "CernCrawler",
     'default_languages': ['en'],
     },
]
# Shortcut tables derived from the RESOURCETYPES declarations:
# (type id, display name) pairs for every resource that declares a
# parser, resp. a crawler.
PARSERS = [
    (res["type"], res["name"])
    for res in RESOURCETYPES
    if res["parser"] is not None
]
CRAWLERS = [
    (res["type"], res["name"])
    for res in RESOURCETYPES
    if res["crawler"] is not None
]
def load_parser(resource):
    """Import and return the parser class declared by *resource*.

    The module file shares the parser's basename: e.g. a ``parser`` of
    "PubmedParser" lives in gargantext.util.parsers.Pubmed.
    Assumes *resource* exposes a ``parser`` attribute — TODO confirm
    (the RESOURCETYPES dicts use key access instead).
    """
    basename = resource.parser.replace("Parser", '')
    parser_module = importlib.import_module(
        'gargantext.util.parsers.%s' % (basename)
    )
    return getattr(parser_module, resource.parser)
def load_crawler(resource):
    """Import and return the crawler class declared by *resource*.

    Bug fix: the original read ``name.replace(...)`` where ``name`` was
    never defined in this scope (NameError on every call); use
    ``resource.crawler``, mirroring load_parser().
    """
    filename = resource.crawler.replace("Crawler", "")
    crawler_module = importlib.import_module(
        'gargantext.util.crawlers.%s' % (filename)
    )
    return getattr(crawler_module, resource.crawler)
# Supported languages and taggers ---------------------------------------------
#first declare the tagger using a string
#and it will be imported into gargantext.utils.taggers
# Supported languages: each maps to the tagger class name (a string;
# the class is imported lazily from gargantext.util.taggers).
# NOTE(review): the extracted text interleaved a deleted RESOURCETYPES
# entry into this dict literal; only the language entries are kept.
LANGUAGES = {
    'en': {
        #'tagger': 'EnglishMeltTagger',
        #'tagger': "TurboTagger",
        'tagger': 'NltkTagger',
    },
    'fr': {
        #'tagger': "FrenchMeltTagger",
        #'tagger': 'TreeTagger',
        'tagger': 'NltkTagger',
    },
}
# type 11
{
'name'
:
'REPEC (RIS format)'
,
'parser'
:
RepecParser
,
'default_language'
:
'en'
,
},
def load_tagger(lang):
    """Import and return the tagger class for *lang*.

    Bug fixes vs. the original:
    - ``LANGUAGES[lang]`` is a dict (``{'tagger': 'NltkTagger'}``); the
      class-name string must be read from its "tagger" key before being
      formatted into the module path.
    - ``getattr(module)`` was missing the required attribute-name
      argument (TypeError); fetch the class by name, as get_tagger does.
    """
    name = LANGUAGES[lang]["tagger"]
    tagger_module = importlib.import_module(
        'gargantext.util.taggers.%s' % (name)
    )
    return getattr(tagger_module, name)
# linguistic extraction parameters ---------------------------------------------
DEFAULT_RANK_CUTOFF_RATIO
=
.75
# MAINLIST maximum terms in %
DEFAULT_RANK_HARD_LIMIT
=
5000
# MAINLIST maximum terms abs
...
...
@@ -257,19 +307,22 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# "cool example".
# (all 1 to n-1 length ngrams,
# at indexing after extraction)
DEFAULT_INDEX_FIELDS
=
(
'title'
,
'abstract'
,
)
#Defaults Fields for ngrams extraction
# Grammar rules for chunking
RULE_JJNN
=
"{<JJ.*>*<NN.*|>+<JJ.*>*}"
RULE_JJDTNN
=
"{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
RULE_TINA
=
"^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|
\
?,){0,2}?(N.?.?,|
\
?,)+?(CD.,)??)
\
+?((PREP.?|DET.?,|IN.?,|CC.?,|
\
?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|
\
?
\
,){0,2}?(N.?.?,|
\
?,)+?)+?)*?$"
# ngram lists import/export parameters -----------------------------------------
DEFAULT_CSV_DELIM
=
'
\t
'
# for import/export CSV defaults
DEFAULT_CSV_DELIM_GROUP
=
'|&|'
# ------------------------------------------------------------------------------
# other parameters
# default number of docs POSTed to scrappers.views.py
# (at page project > add a corpus > scan/process sample)
QUERY_SIZE_N_DEFAULT
=
1000
# Files ----------------------------------------------------------------
import
os
from
.settings
import
BASE_DIR
# uploads/.gitignore prevents corpora indexing
...
...
@@ -278,7 +331,7 @@ UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads/corpora')
UPLOAD_LIMIT
=
1024
*
1024
*
1024
DOWNLOAD_DIRECTORY
=
UPLOAD_DIRECTORY
# Processing -----------------------------------------------------------
# about batch processing...
BATCH_PARSING_SIZE
=
256
BATCH_NGRAMSEXTRACTION_SIZE
=
3000
# how many distinct ngrams before INTEGRATE
...
...
gargantext/util/crawlers/__init__.py
0 → 100644
View file @
6740e4df
import
importlib
from
gargantext.constants
import
RESOURCETYPES
from
gargantext.settings
import
DEBUG
#if DEBUG: print("Loading available Crawlers")
# Eagerly import every crawler module declared in RESOURCETYPES so the
# crawler classes are registered; a failing import is reported but does
# not abort startup.
base_parser = "gargantext.util.crawlers"

for resource in RESOURCETYPES:
    if resource["crawler"] is None:
        continue
    try:
        name = resource["crawler"]
        # Crawler class names follow the basename + "Crawler" convention;
        # the module file is the lowercased basename (PubmedCrawler -> pubmed).
        filename = name.replace("Crawler", "").lower()
        importlib.import_module(base_parser + ".%s" % (filename))
        #if DEBUG: print("\t-", name)
    except Exception as e:
        print("Check constants.py RESOURCETYPES declaration %s\nCRAWLER %s is not available for %s" % (str(e), resource["crawler"], resource["name"]))
#initial import
#from .cern import CernCrawler
#from .istex import ISTexCrawler
#from .pubmed import PubmedCrawler
gargantext/util/taggers/__init__.py
View file @
6740e4df
from
.TurboTagger
import
TurboTagger
from
.NltkTagger
import
NltkTagger
from
.TreeTagger
import
TreeTagger
from
.MeltTagger
import
EnglishMeltTagger
,
FrenchMeltTagger
#version2
#imported as needed
#Version 1
#~ import importlib
#~ from gargantext.constants import LANGUAGES
#~ from gargantext.settings import DEBUG
#~ if DEBUG:
#~ print("Loading available Taggers:")
#~ for lang, tagger in LANGUAGES.items():
#~ tagger = tagger["tagger"]
#~ filename = "gargantext.util.taggers.%s" %(tagger)
#~ if DEBUG:
#~ print("\t-%s (%s)" %(tagger, lang))
#~ getattr(importlib.import_module(filename), tagger)()
#VERSION 0
#~ #initally a manual import declaration
#~ from .TurboTagger import TurboTagger
#~ from .NltkTagger import NltkTagger
#~ from .TreeTagger import TreeTagger
#~ from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment