Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
098ec535
Commit
098ec535
authored
Jun 28, 2017
by
sim
Committed by
sim
Jul 10, 2017
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[REFACT] Move nodes related constants & funcs in a separated file
parent
f8aa5546
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
288 additions
and
261 deletions
+288
-261
constants.py
gargantext/constants.py
+6
-259
nodes.py
gargantext/models/nodes.py
+6
-2
nodes_constants.py
gargantext/models/nodes_constants.py
+276
-0
No files found.
gargantext/constants.py
View file @
098ec535
"""
# WARNING: to ensure consistency and retrocompatibility, lists should keep the
# initial order (ie., new elements should be appended at the end of the lists)
abstract:
---------
...
...
@@ -9,10 +7,6 @@ abstract:
contents:
---------
+ database constants/ontology
- nodetypes
(db int <=> named types <=> python code)
+ low-level limits
- query size
- max upload size
...
...
@@ -20,150 +14,29 @@ contents:
- word extraction batch size
+ main process config
- resourcetypes config (~ input ontology)
- wordlist generation params
- graph creation params
- £TODO sequence of transformations "custom pipeline"
+ subprocess config
- crawling, import
- tagger services and functions
- parser services
- stemmer services
"""
import
os
import
re
import
importlib
from
gargantext.util.lists
import
WeightedList
,
UnweightedList
,
WeightedIndex
,
\
WeightedMatrix
,
Translations
from
gargantext.util.dates
import
datetime
,
convert_to_datetime
from
.settings
import
BASE_DIR
# types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
# Maps each list-node type name to the python container class used to
# (de)serialize its contents.
LISTTYPES = {
    'DOCUMENT':      WeightedList,
    'GROUPLIST':     Translations,    # todo remove "LIST" from name
    'STOPLIST':      UnweightedList,
    'MAINLIST':      UnweightedList,
    'MAPLIST':       UnweightedList,
    'SPECCLUSION':   WeightedList,
    'GENCLUSION':    WeightedList,
    'OCCURRENCES':   WeightedIndex,   # could be WeightedList
    'COOCCURRENCES': WeightedMatrix,
    'TFIDF-CORPUS':  WeightedIndex,
    'TFIDF-GLOBAL':  WeightedIndex,
    'TIRANK-LOCAL':  WeightedIndex,   # could be WeightedList
}
# 'OWNLIST' : UnweightedList, # £TODO use this for any term-level tags
# Node type names, indexed by the integer id stored in the database.
# WARNING: the position of each entry IS its db id — only append new types.
NODETYPES = [
    # TODO separate id not array index, read by models.node
    None,             # 0 (unused so real types start at 1)
    # documents hierarchy
    'USER',           # 1
    'PROJECT',        # 2
    # RESOURCE should be here but was appended last (see 19)
    'CORPUS',         # 3
    'DOCUMENT',       # 4
    # lists
    'STOPLIST',       # 5
    'GROUPLIST',      # 6
    'MAINLIST',       # 7
    'MAPLIST',        # 8
    'COOCCURRENCES',  # 9
    # scores
    'OCCURRENCES',    # 10
    'SPECCLUSION',    # 11
    'CVALUE',         # 12
    'TFIDF-CORPUS',   # 13
    'TFIDF-GLOBAL',   # 14
    # docs subset
    'FAVORITES',      # 15
    # more scores (sorry!)
    'TIRANK-LOCAL',   # 16
    'TIRANK-GLOBAL',  # 17
    'GENCLUSION',     # 18
    'RESOURCE',       # 19
]
# Hyperdata fields that get a dedicated index in the database: each entry
# maps the field name to its db id, python type, and the conversion
# callables applied when writing to / reading from the db.
INDEXED_HYPERDATA = {
    # TODO use properties during toolchain.hyperdata_indexing
    # (type, convert_to_db, convert_from_db)
    'count':
        {'id': 1,
         'type': int,
         'convert_to_db': int,
         'convert_from_db': int},

    'publication_date':
        {'id': 2,
         'type': datetime,
         'convert_to_db': convert_to_datetime,
         'convert_from_db': convert_to_datetime},

    'title':
        {'id': 3,
         'type': str,
         'convert_to_db': str,
         'convert_from_db': str},

    'authors':
        {'id': 4,
         'type': str,
         'convert_to_db': str,
         'convert_from_db': str},

    'journal':
        {'id': 5,
         'type': str,
         'convert_to_db': str,
         'convert_from_db': str},

    'abstract':
        {'id': 6,
         'type': str,
         'convert_to_db': str,
         'convert_from_db': str},

    # 'text':
    #     {'id': 7,
    #      'type': str,
    #      'convert_to_db': str,
    #      'convert_from_db': str},
    #
    # 'page':
    #     {'id': 8,
    #      'type': int,
    #      'convert_to_db': int,
    #      'convert_from_db': int},
}

# XXX Originally defined here, imported here for backward-compatibility,
# should be removed later.
from .models.nodes_constants import NODETYPES, LISTTYPES, INDEXED_HYPERDATA, \
    RESOURCETYPES, get_resource, get_resource_by_name, \
    load_parser, load_crawler
# user parameters----------------------------------------
# Interface languages offered to users.
USER_LANG = ["fr", "en"]
# resources ---------------------------------------------
def get_resource(sourcetype):
    """resource :: type => resource dict

    Resource "type" ids start at 1, hence the offset into the 0-based list.
    """
    return RESOURCETYPES[sourcetype - 1]
def get_resource_by_name(sourcename):
    """resource :: name => resource dict (None when no name matches)"""
    wanted = str(sourcename)
    for resource in RESOURCETYPES:
        if str(resource["name"]) == wanted:
            return resource
    return None  # explicit: the original fell off the end with the same result
# taggers -----------------------------------------------
def
get_tagger
(
lang
):
'''
...
...
@@ -176,132 +49,6 @@ def get_tagger(lang):
return
tagger
()
# resourcetypes config (~ input ontology): one dict per supported source,
# giving its parser class name, accepted upload formats and optional crawler.
# WARNING: "type" ids are persisted — only append new entries.
RESOURCETYPES = [
    {"type": 1,
     'name': 'Europresse',
     'format': 'Europresse',
     'parser': "EuropresseParser",
     'file_formats': ["zip", "txt"],
     'crawler': None,
     },
    {'type': 2,
     'name': 'Jstor [RIS]',
     'format': 'RIS',
     'parser': "RISParser",
     'file_formats': ["zip", "txt"],
     'crawler': None,
     },
    {'type': 3,
     'name': 'Pubmed [XML]',
     'format': 'Pubmed',
     'parser': "PubmedParser",
     'file_formats': ["zip", "xml"],
     'crawler': "PubmedCrawler",
     },
    {'type': 4,
     'name': 'Scopus [RIS]',
     'format': 'RIS',
     'parser': "RISParser",
     'file_formats': ["zip", "txt"],
     'crawler': None,
     },
    {'type': 5,
     'name': 'Web of Science [ISI]',
     'format': 'ISI',
     'parser': "ISIParser",
     'file_formats': ["zip", "txt", "isi"],
     # 'crawler': "ISICrawler",
     'crawler': None,
     },
    {'type': 6,
     'name': 'Zotero [RIS]',
     'format': 'RIS',
     'parser': 'RISParser',
     'file_formats': ["zip", "ris", "txt"],
     'crawler': None,
     },
    {'type': 7,
     'name': 'CSV',
     'format': 'CSV',
     'parser': 'CSVParser',
     'file_formats': ["zip", "csv"],
     'crawler': None,
     },
    {'type': 8,
     'name': 'ISTex',
     'format': 'json',
     'parser': "ISTexParser",
     'file_formats': ["zip", "txt"],
     'crawler': None,
     },
    {"type": 9,
     "name": 'SCOAP [API/XML]',
     "parser": "CernParser",
     "format": 'MARC21',
     'file_formats': ["zip", "xml"],
     "crawler": "CernCrawler",
     },
    # { "type": 10,
    #   "name": 'REPEC [RIS]',
    #   "parser": "RISParser",
    #   "format": 'RIS',
    #   'file_formats':["zip","ris", "txt"],
    #   "crawler": None,
    # },
    #
    {"type": 10,
     "name": 'REPEC [MULTIVAC API]',
     "parser": "MultivacParser",
     "format": 'JSON',
     'file_formats': ["zip", "json"],
     "crawler": "MultivacCrawler",
     },
    {"type": 11,
     "name": 'HAL [API]',
     "parser": "HalParser",
     "format": 'JSON',
     'file_formats': ["zip", "json"],
     "crawler": "HalCrawler",
     },
    {"type": 12,
     "name": 'ISIDORE [SPARQLE API /!\ BETA]',
     "parser": "IsidoreParser",
     "format": 'JSON',
     'file_formats': ["zip", "json"],
     "crawler": "IsidoreCrawler",
     },
]
def load_parser(resource):
    """Resolve a resource's Parser class.

    resource(dict) > Parser(object): imports the module from
    gargantext.util.parsers named after the parser class with its
    "Parser" suffix stripped and upper-cased, e.g.
    "ISTexParser" -> module ISTEX, and returns the class itself.
    """
    classname = resource["parser"]
    modname = 'gargantext.util.parsers.%s' % classname.replace("Parser", '').upper()
    return getattr(importlib.import_module(modname), classname)
def load_crawler(resource):
    """Resolve a resource's Crawler class.

    resource(dict) > Crawler(object): imports the module from
    gargantext.util.crawlers named after the crawler class with its
    "Crawler" suffix stripped and upper-cased, e.g.
    "ISTexCrawler" -> module ISTEX, and returns the class itself.
    Assumes resource["crawler"] is not None — callers must check first.
    """
    classname = resource["crawler"]
    modname = 'gargantext.util.crawlers.%s' % classname.replace("Crawler", "").upper()
    return getattr(importlib.import_module(modname), classname)
# Supported languages and taggers ---------------------------------------------
#first declare the tagger using a string
#and it will be imported into gargantext.utils.taggers
...
...
@@ -329,7 +76,6 @@ def load_tagger(lang):
return
getattr
(
module
,
filename
)()
# linguistic extraction parameters ---------------------------------------------
DEFAULT_RANK_CUTOFF_RATIO = .75  # MAINLIST maximum terms in %
...
...
@@ -367,6 +113,7 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# "cool example".
# (all 1 to n-1 length ngrams,
# at indexing after extraction)
# TAGGING options -----------------------------------------
# Activate automatic language detection?
DETECT_LANG = False
...
...
gargantext/models/nodes.py
View file @
098ec535
from
gargantext.util.db
import
session
from
gargantext.util.files
import
upload
from
gargantext.constants
import
*
from
datetime
import
datetime
...
...
@@ -8,8 +7,11 @@ from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index,
Integer
,
Float
,
String
,
DateTime
,
JSONB
,
\
MutableList
,
MutableDict
from
.users
import
User
from
.nodes_constants
import
NODETYPES
,
LISTTYPES
,
INDEXED_HYPERDATA
# Public API of this module.  NODETYPES / LISTTYPES / INDEXED_HYPERDATA are
# re-exported from .nodes_constants for backward compatibility.
# NOTE: merged into a single assignment — the source had two consecutive
# `__all__ = [...]` statements, the second silently shadowing the first.
__all__ = ['NODETYPES', 'LISTTYPES', 'INDEXED_HYPERDATA',
           'Node', 'NodeNode', 'CorpusNode']
class NodeType(TypeDecorator):
    """Define a new type of column to describe a Node's type.

    Values are detailed in `gargantext.constants.NODETYPES`.
    """
    # Stored in the database as the integer position within NODETYPES.
    impl = Integer

    def process_bind_param(self, typename, dialect):
        # name -> integer index, applied when writing to the db
        return NODETYPES.index(typename)

    def process_result_value(self, typeindex, dialect):
        # integer index -> name, applied when loading from the db
        return NODETYPES[typeindex]
...
...
gargantext/models/nodes_constants.py
0 → 100644
View file @
098ec535
"""
# WARNING: to ensure consistency and retrocompatibility, lists should keep the
# initial order (ie., new elements should be appended at the end of the lists)
abstract:
---------
constants and utility functions related to nodes
initially in gargantext.constants
contents:
---------
+ database constants/ontology
- nodetypes
(db int <=> named types <=> python code)
+ main process config
- resourcetypes config (~ input ontology)
+ subprocess config
- crawling, import
- parser services
"""
import
importlib
from
..util.lists
import
WeightedList
,
UnweightedList
,
WeightedIndex
,
\
WeightedMatrix
,
Translations
from
..util.dates
import
datetime
,
convert_to_datetime
# types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
# Node type names, indexed by the integer id stored in the database.
# WARNING: the position of each entry IS its db id — only append new types.
NODETYPES = [
    # TODO separate id not array index, read by models.node
    None,             # 0 (unused so real types start at 1)
    # documents hierarchy
    'USER',           # 1
    'PROJECT',        # 2
    # RESOURCE should be here but was appended last (see 19)
    'CORPUS',         # 3
    'DOCUMENT',       # 4
    # lists
    'STOPLIST',       # 5
    'GROUPLIST',      # 6
    'MAINLIST',       # 7
    'MAPLIST',        # 8
    'COOCCURRENCES',  # 9
    # scores
    'OCCURRENCES',    # 10
    'SPECCLUSION',    # 11
    'CVALUE',         # 12
    'TFIDF-CORPUS',   # 13
    'TFIDF-GLOBAL',   # 14
    # docs subset
    'FAVORITES',      # 15
    # more scores (sorry!)
    'TIRANK-LOCAL',   # 16
    'TIRANK-GLOBAL',  # 17
    'GENCLUSION',     # 18
    'RESOURCE',       # 19
]
# Maps each list-node type name to the python container class used to
# (de)serialize its contents.
LISTTYPES = {
    'DOCUMENT':      WeightedList,
    'GROUPLIST':     Translations,    # todo remove "LIST" from name
    'STOPLIST':      UnweightedList,
    'MAINLIST':      UnweightedList,
    'MAPLIST':       UnweightedList,
    'SPECCLUSION':   WeightedList,
    'GENCLUSION':    WeightedList,
    'OCCURRENCES':   WeightedIndex,   # could be WeightedList
    'COOCCURRENCES': WeightedMatrix,
    'TFIDF-CORPUS':  WeightedIndex,
    'TFIDF-GLOBAL':  WeightedIndex,
    'TIRANK-LOCAL':  WeightedIndex,   # could be WeightedList
}
# 'OWNLIST' : UnweightedList, # £TODO use this for any term-level tags
# Hyperdata fields that get a dedicated index in the database: each entry
# maps the field name to its db id, python type, and the conversion
# callables applied when writing to / reading from the db.
INDEXED_HYPERDATA = {
    # TODO use properties during toolchain.hyperdata_indexing
    # (type, convert_to_db, convert_from_db)
    'count':
        {'id': 1,
         'type': int,
         'convert_to_db': int,
         'convert_from_db': int},

    'publication_date':
        {'id': 2,
         'type': datetime,
         'convert_to_db': convert_to_datetime,
         'convert_from_db': convert_to_datetime},

    'title':
        {'id': 3,
         'type': str,
         'convert_to_db': str,
         'convert_from_db': str},

    'authors':
        {'id': 4,
         'type': str,
         'convert_to_db': str,
         'convert_from_db': str},

    'journal':
        {'id': 5,
         'type': str,
         'convert_to_db': str,
         'convert_from_db': str},

    'abstract':
        {'id': 6,
         'type': str,
         'convert_to_db': str,
         'convert_from_db': str},

    # 'text':
    #     {'id': 7,
    #      'type': str,
    #      'convert_to_db': str,
    #      'convert_from_db': str},
    #
    # 'page':
    #     {'id': 8,
    #      'type': int,
    #      'convert_to_db': int,
    #      'convert_from_db': int},
}
# resources ---------------------------------------------
def get_resource(sourcetype):
    """resource :: type => resource dict

    Resource "type" ids start at 1, hence the offset into the 0-based list.
    """
    return RESOURCETYPES[sourcetype - 1]
def get_resource_by_name(sourcename):
    """resource :: name => resource dict (None when no name matches)"""
    wanted = str(sourcename)
    for resource in RESOURCETYPES:
        if str(resource["name"]) == wanted:
            return resource
    return None  # explicit: the original fell off the end with the same result
# resourcetypes config (~ input ontology): one dict per supported source,
# giving its parser class name, accepted upload formats and optional crawler.
# WARNING: "type" ids are persisted — only append new entries.
RESOURCETYPES = [
    {"type": 1,
     'name': 'Europresse',
     'format': 'Europresse',
     'parser': "EuropresseParser",
     'file_formats': ["zip", "txt"],
     'crawler': None,
     },
    {'type': 2,
     'name': 'Jstor [RIS]',
     'format': 'RIS',
     'parser': "RISParser",
     'file_formats': ["zip", "txt"],
     'crawler': None,
     },
    {'type': 3,
     'name': 'Pubmed [XML]',
     'format': 'Pubmed',
     'parser': "PubmedParser",
     'file_formats': ["zip", "xml"],
     'crawler': "PubmedCrawler",
     },
    {'type': 4,
     'name': 'Scopus [RIS]',
     'format': 'RIS',
     'parser': "RISParser",
     'file_formats': ["zip", "txt"],
     'crawler': None,
     },
    {'type': 5,
     'name': 'Web of Science [ISI]',
     'format': 'ISI',
     'parser': "ISIParser",
     'file_formats': ["zip", "txt", "isi"],
     # 'crawler': "ISICrawler",
     'crawler': None,
     },
    {'type': 6,
     'name': 'Zotero [RIS]',
     'format': 'RIS',
     'parser': 'RISParser',
     'file_formats': ["zip", "ris", "txt"],
     'crawler': None,
     },
    {'type': 7,
     'name': 'CSV',
     'format': 'CSV',
     'parser': 'CSVParser',
     'file_formats': ["zip", "csv"],
     'crawler': None,
     },
    {'type': 8,
     'name': 'ISTex',
     'format': 'json',
     'parser': "ISTexParser",
     'file_formats': ["zip", "txt"],
     'crawler': None,
     },
    {"type": 9,
     "name": 'SCOAP [API/XML]',
     "parser": "CernParser",
     "format": 'MARC21',
     'file_formats': ["zip", "xml"],
     "crawler": "CernCrawler",
     },
    # { "type": 10,
    #   "name": 'REPEC [RIS]',
    #   "parser": "RISParser",
    #   "format": 'RIS',
    #   'file_formats':["zip","ris", "txt"],
    #   "crawler": None,
    # },
    #
    {"type": 10,
     "name": 'REPEC [MULTIVAC API]',
     "parser": "MultivacParser",
     "format": 'JSON',
     'file_formats': ["zip", "json"],
     "crawler": "MultivacCrawler",
     },
    {"type": 11,
     "name": 'HAL [API]',
     "parser": "HalParser",
     "format": 'JSON',
     'file_formats': ["zip", "json"],
     "crawler": "HalCrawler",
     },
    {"type": 12,
     "name": 'ISIDORE [SPARQLE API /!\ BETA]',
     "parser": "IsidoreParser",
     "format": 'JSON',
     'file_formats': ["zip", "json"],
     "crawler": "IsidoreCrawler",
     },
]
def load_parser(resource):
    """Resolve a resource's Parser class.

    resource(dict) > Parser(object): imports the module from
    gargantext.util.parsers named after the parser class with its
    "Parser" suffix stripped and upper-cased, e.g.
    "ISTexParser" -> module ISTEX, and returns the class itself.
    """
    classname = resource["parser"]
    modname = 'gargantext.util.parsers.%s' % classname.replace("Parser", '').upper()
    return getattr(importlib.import_module(modname), classname)
def load_crawler(resource):
    """Resolve a resource's Crawler class.

    resource(dict) > Crawler(object): imports the module from
    gargantext.util.crawlers named after the crawler class with its
    "Crawler" suffix stripped and upper-cased, e.g.
    "ISTexCrawler" -> module ISTEX, and returns the class itself.
    Assumes resource["crawler"] is not None — callers must check first.
    """
    classname = resource["crawler"]
    modname = 'gargantext.util.crawlers.%s' % classname.replace("Crawler", "").upper()
    return getattr(importlib.import_module(modname), classname)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment