Commit 098ec535 authored by sim, committed by sim

[REFACT] Move node-related constants & funcs into a separate file

parent f8aa5546
"""
# WARNING: to ensure consistency and backward compatibility, lists should keep
# their initial order (i.e., new elements should be appended at the end)
abstract:
---------
@@ -9,10 +7,6 @@ abstract:
contents:
---------
+ database constants/ontology
- nodetypes
(db int <=> named types <=> python code)
+ low-level limits
- query size
- max upload size
@@ -20,150 +14,29 @@
- word extraction batch size
+ main process config
- resourcetypes config (~ input ontology)
- wordlist generation params
- graph creation params
- £TODO sequence of transformations "custom pipeline"
+ subprocess config
- crawling, import
- tagger services and functions
- parser services
- stemmer services
"""
import os
import re
import importlib
from gargantext.util.lists import WeightedList, UnweightedList, WeightedIndex, \
                                  WeightedMatrix, Translations
from gargantext.util.dates import datetime, convert_to_datetime
from .settings import BASE_DIR
# types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
LISTTYPES = {
'DOCUMENT' : WeightedList,
'GROUPLIST' : Translations, # todo remove "LIST" from name
'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList,
'SPECCLUSION' : WeightedList,
'GENCLUSION' : WeightedList,
'OCCURRENCES' : WeightedIndex, # could be WeightedList
'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedIndex,
'TFIDF-GLOBAL' : WeightedIndex,
'TIRANK-LOCAL' : WeightedIndex, # could be WeightedList
}
# 'OWNLIST' : UnweightedList, # £TODO use this for any term-level tags
NODETYPES = [
# TODO use an explicit id rather than the array index (read by models.node)
None, # 0
# documents hierarchy
'USER', # 1
'PROJECT', # 2
# RESOURCE should be here, but was appended last (see WARNING above)
'CORPUS', # 3
'DOCUMENT', # 4
# lists
'STOPLIST', # 5
'GROUPLIST', # 6
'MAINLIST', # 7
'MAPLIST', # 8
'COOCCURRENCES', # 9
# scores
'OCCURRENCES', # 10
'SPECCLUSION', # 11
'CVALUE', # 12
'TFIDF-CORPUS', # 13
'TFIDF-GLOBAL', # 14
# docs subset
'FAVORITES', # 15
# more scores (sorry!)
'TIRANK-LOCAL', # 16
'TIRANK-GLOBAL', # 17
'GENCLUSION', # 18
'RESOURCE', # 19
]
INDEXED_HYPERDATA = {
# TODO use properties during toolchain.hyperdata_indexing
# (type, convert_to_db, convert_from_db)
'count':
{ 'id' : 1
, 'type' : int
, 'convert_to_db' : int
, 'convert_from_db': int
},
'publication_date':
{ 'id' : 2
, 'type' : datetime
, 'convert_to_db' : convert_to_datetime
, 'convert_from_db': convert_to_datetime
},
'title':
{ 'id' : 3
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'authors':
{ 'id' : 4
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'journal':
{ 'id' : 5
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'abstract':
{ 'id' : 6
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
# 'text':
# { 'id' : 7
# , 'type' : str
# , 'convert_to_db' : str
# , 'convert_from_db': str
# },
#
# 'page':
# { 'id' : 8
# , 'type' : int
# , 'convert_to_db' : int
# , 'convert_from_db': int
# },
}
# XXX Originally defined here, imported here for backward-compatibility,
# should be removed later.
from .models.nodes_constants import NODETYPES, LISTTYPES, INDEXED_HYPERDATA, \
                                    RESOURCETYPES, get_resource, get_resource_by_name, \
                                    load_parser, load_crawler
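# Sketch (not part of this commit) of what the re-export above preserves:
# downstream code that still imports these names from gargantext.constants
# keeps working unchanged, e.g.
#
#     from gargantext.constants import NODETYPES, get_resource
#     get_resource(1)['name']   # -> 'Europresse'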
# user parameters----------------------------------------
USER_LANG = ["fr", "en"]
# resources ---------------------------------------------
def get_resource(sourcetype):
    '''resource :: type => resource dict'''
    return RESOURCETYPES[sourcetype-1]

def get_resource_by_name(sourcename):
    '''resource :: name => resource dict'''
    for n in RESOURCETYPES:
        if str(n["name"]) == str(sourcename):
            return n
# taggers -----------------------------------------------
def get_tagger(lang):
'''
@@ -176,132 +49,6 @@ def get_tagger(lang):
    return tagger()
RESOURCETYPES = [
{ "type": 1,
'name': 'Europresse',
'format': 'Europresse',
'parser': "EuropresseParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ 'type': 2,
'name': 'Jstor [RIS]',
'format': 'RIS',
'parser': "RISParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ 'type': 3,
'name': 'Pubmed [XML]',
'format': 'Pubmed',
'parser': "PubmedParser",
'file_formats':["zip", "xml"],
'crawler': "PubmedCrawler",
},
{ 'type': 4,
'name': 'Scopus [RIS]',
'format': 'RIS',
'parser': "RISParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ 'type': 5,
'name': 'Web of Science [ISI]',
'format': 'ISI',
'parser': "ISIParser",
'file_formats':["zip", "txt", "isi"],
#'crawler': "ISICrawler",
'crawler': None,
},
{ 'type': 6,
'name': 'Zotero [RIS]',
'format': 'RIS',
'parser': 'RISParser',
'file_formats':["zip", "ris", "txt"],
'crawler': None,
},
{ 'type': 7,
'name': 'CSV',
'format': 'CSV',
'parser': 'CSVParser',
'file_formats':["zip", "csv"],
'crawler': None,
},
{ 'type': 8,
'name': 'ISTex',
'format': 'json',
'parser': "ISTexParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ "type": 9,
"name": 'SCOAP [API/XML]',
"parser": "CernParser",
"format": 'MARC21',
'file_formats':["zip","xml"],
"crawler": "CernCrawler",
},
# { "type": 10,
# "name": 'REPEC [RIS]',
# "parser": "RISParser",
# "format": 'RIS',
# 'file_formats':["zip","ris", "txt"],
# "crawler": None,
# },
#
{ "type": 10,
"name": 'REPEC [MULTIVAC API]',
"parser": "MultivacParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "MultivacCrawler",
},
{ "type": 11,
"name": 'HAL [API]',
"parser": "HalParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "HalCrawler",
},
{ "type": 12,
"name": 'ISIDORE [SPARQLE API /!\ BETA]',
"parser": "IsidoreParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "IsidoreCrawler",
},
]
def load_parser(resource):
    '''given a resource, load the corresponding Parser class
    resource(dict) -> Parser(class)
    example with the ISTex resource:
        parser module name: ISTEX
        parser object:      ISTexParser
    '''
    filename = resource["parser"].replace("Parser", '').upper()
    module = 'gargantext.util.parsers.%s' % filename
    module = importlib.import_module(module)
    return getattr(module, resource["parser"])
def load_crawler(resource):
    '''given a resource, load the corresponding Crawler class
    resource(dict) -> Crawler(class)
    (assumes resource["crawler"] is set)
    example with the ISTex resource:
        crawler module name: ISTEX
        crawler object:      ISTexCrawler
    '''
    filename = resource["crawler"].replace("Crawler", "").upper()
    module = 'gargantext.util.crawlers.%s' % filename
    module = importlib.import_module(module)
    return getattr(module, resource["crawler"])
# Supported languages and taggers ---------------------------------------------
# first declare the tagger as a string,
# then it will be imported from gargantext.util.taggers
@@ -329,7 +76,6 @@ def load_tagger(lang):
    return getattr(module, filename)()
# linguistic extraction parameters ---------------------------------------------
DEFAULT_RANK_CUTOFF_RATIO = .75 # max proportion of terms kept in the MAINLIST
@@ -367,6 +113,7 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# "cool example".
# (all 1 to n-1 length ngrams,
# at indexing after extraction)
# TAGGING options -----------------------------------------
# activate language detection?
DETECT_LANG = False
gargantext/models/nodes.py:
from gargantext.util.db import session
from gargantext.util.files import upload
from gargantext.constants import *
from datetime import datetime
@@ -8,8 +7,11 @@ from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
                  Integer, Float, String, DateTime, JSONB, \
                  MutableList, MutableDict
from .users import User
+from .nodes_constants import NODETYPES, LISTTYPES, INDEXED_HYPERDATA
-__all__ = ['NODETYPES', 'LISTTYPES', 'INDEXED_HYPERDATA',
-           'Node', 'NodeNode', 'CorpusNode']
+__all__ = ['Node', 'NodeNode', 'CorpusNode']
class NodeType(TypeDecorator):
"""Define a new type of column to describe a Node's type.
@@ -17,8 +19,10 @@ class NodeType(TypeDecorator):
Values are detailed in `gargantext.constants.NODETYPES`.
"""
impl = Integer
    def process_bind_param(self, typename, dialect):
        return NODETYPES.index(typename)

    def process_result_value(self, typeindex, dialect):
        return NODETYPES[typeindex]
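# Sketch (not in the original file): the int <=> typename round-trip the
# decorator above performs; it relies on NODETYPES keeping its initial
# order, per the WARNING in nodes_constants.
assert NODETYPES.index('CORPUS') == 3   # the integer stored in the DB column
assert NODETYPES[3] == 'CORPUS'         # the name handed back to Python code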
gargantext/models/nodes_constants.py (new file):
"""
# WARNING: to ensure consistency and backward compatibility, lists should keep
# their initial order (i.e., new elements should be appended at the end)
abstract:
---------
constants and utility functions related to nodes,
initially located in gargantext.constants
contents:
---------
+ database constants/ontology
- nodetypes
(db int <=> named types <=> python code)
+ main process config
- resourcetypes config (~ input ontology)
+ subprocess config
- crawling, import
- parser services
"""
import importlib
from ..util.lists import WeightedList, UnweightedList, WeightedIndex, \
                         WeightedMatrix, Translations
from ..util.dates import datetime, convert_to_datetime
# types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
NODETYPES = [
# TODO use an explicit id rather than the array index (read by models.node)
None, # 0
# documents hierarchy
'USER', # 1
'PROJECT', # 2
# RESOURCE should be here, but was appended last (see WARNING above)
'CORPUS', # 3
'DOCUMENT', # 4
# lists
'STOPLIST', # 5
'GROUPLIST', # 6
'MAINLIST', # 7
'MAPLIST', # 8
'COOCCURRENCES', # 9
# scores
'OCCURRENCES', # 10
'SPECCLUSION', # 11
'CVALUE', # 12
'TFIDF-CORPUS', # 13
'TFIDF-GLOBAL', # 14
# docs subset
'FAVORITES', # 15
# more scores (sorry!)
'TIRANK-LOCAL', # 16
'TIRANK-GLOBAL', # 17
'GENCLUSION', # 18
'RESOURCE', # 19
]
LISTTYPES = {
'DOCUMENT' : WeightedList,
'GROUPLIST' : Translations, # todo remove "LIST" from name
'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList,
'SPECCLUSION' : WeightedList,
'GENCLUSION' : WeightedList,
'OCCURRENCES' : WeightedIndex, # could be WeightedList
'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedIndex,
'TFIDF-GLOBAL' : WeightedIndex,
'TIRANK-LOCAL' : WeightedIndex, # could be WeightedList
}
# 'OWNLIST' : UnweightedList, # £TODO use this for any term-level tags
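# Illustrative lookups (not part of the original file): a list node's
# typename selects the Python container class used to handle its data.
assert LISTTYPES['STOPLIST'] is UnweightedList
assert LISTTYPES['COOCCURRENCES'] is WeightedMatrix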
INDEXED_HYPERDATA = {
# TODO use properties during toolchain.hyperdata_indexing
# (type, convert_to_db, convert_from_db)
'count':
{ 'id' : 1
, 'type' : int
, 'convert_to_db' : int
, 'convert_from_db': int
},
'publication_date':
{ 'id' : 2
, 'type' : datetime
, 'convert_to_db' : convert_to_datetime
, 'convert_from_db': convert_to_datetime
},
'title':
{ 'id' : 3
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'authors':
{ 'id' : 4
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'journal':
{ 'id' : 5
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'abstract':
{ 'id' : 6
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
# 'text':
# { 'id' : 7
# , 'type' : str
# , 'convert_to_db' : str
# , 'convert_from_db': str
# },
#
# 'page':
# { 'id' : 8
# , 'type' : int
# , 'convert_to_db' : int
# , 'convert_from_db': int
# },
}
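# Sketch of how the (id, converter) pairs above are meant to be used when
# indexing hyperdata; `index_value` is a hypothetical helper, not part of
# this module:
def index_value(field, raw_value):
    spec = INDEXED_HYPERDATA[field]
    return spec['id'], spec['convert_to_db'](raw_value)

# e.g. index_value('title', 42) == (3, '42')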
# resources ---------------------------------------------
def get_resource(sourcetype):
    '''resource :: type => resource dict'''
    return RESOURCETYPES[sourcetype-1]

def get_resource_by_name(sourcename):
    '''resource :: name => resource dict'''
    for n in RESOURCETYPES:
        if str(n["name"]) == str(sourcename):
            return n
RESOURCETYPES = [
{ "type": 1,
'name': 'Europresse',
'format': 'Europresse',
'parser': "EuropresseParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ 'type': 2,
'name': 'Jstor [RIS]',
'format': 'RIS',
'parser': "RISParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ 'type': 3,
'name': 'Pubmed [XML]',
'format': 'Pubmed',
'parser': "PubmedParser",
'file_formats':["zip", "xml"],
'crawler': "PubmedCrawler",
},
{ 'type': 4,
'name': 'Scopus [RIS]',
'format': 'RIS',
'parser': "RISParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ 'type': 5,
'name': 'Web of Science [ISI]',
'format': 'ISI',
'parser': "ISIParser",
'file_formats':["zip", "txt", "isi"],
#'crawler': "ISICrawler",
'crawler': None,
},
{ 'type': 6,
'name': 'Zotero [RIS]',
'format': 'RIS',
'parser': 'RISParser',
'file_formats':["zip", "ris", "txt"],
'crawler': None,
},
{ 'type': 7,
'name': 'CSV',
'format': 'CSV',
'parser': 'CSVParser',
'file_formats':["zip", "csv"],
'crawler': None,
},
{ 'type': 8,
'name': 'ISTex',
'format': 'json',
'parser': "ISTexParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ "type": 9,
"name": 'SCOAP [API/XML]',
"parser": "CernParser",
"format": 'MARC21',
'file_formats':["zip","xml"],
"crawler": "CernCrawler",
},
# { "type": 10,
# "name": 'REPEC [RIS]',
# "parser": "RISParser",
# "format": 'RIS',
# 'file_formats':["zip","ris", "txt"],
# "crawler": None,
# },
#
{ "type": 10,
"name": 'REPEC [MULTIVAC API]',
"parser": "MultivacParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "MultivacCrawler",
},
{ "type": 11,
"name": 'HAL [API]',
"parser": "HalParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "HalCrawler",
},
{ "type": 12,
"name": 'ISIDORE [SPARQLE API /!\ BETA]',
"parser": "IsidoreParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "IsidoreCrawler",
},
]
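# Illustrative lookups against the table above (not part of the original file):
assert get_resource(3)['name'] == 'Pubmed [XML]'             # type 3 -> index 2
assert get_resource_by_name('Pubmed [XML]')['parser'] == 'PubmedParser'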
def load_parser(resource):
    '''given a resource, load the corresponding Parser class
    resource(dict) -> Parser(class)
    example with the ISTex resource:
        parser module name: ISTEX
        parser object:      ISTexParser
    '''
    filename = resource["parser"].replace("Parser", '').upper()
    module = 'gargantext.util.parsers.%s' % filename
    module = importlib.import_module(module)
    return getattr(module, resource["parser"])
def load_crawler(resource):
    '''given a resource, load the corresponding Crawler class
    resource(dict) -> Crawler(class)
    (assumes resource["crawler"] is set)
    example with the ISTex resource:
        crawler module name: ISTEX
        crawler object:      ISTexCrawler
    '''
    filename = resource["crawler"].replace("Crawler", "").upper()
    module = 'gargantext.util.crawlers.%s' % filename
    module = importlib.import_module(module)
    return getattr(module, resource["crawler"])
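# Hedged usage sketch: resolving the Pubmed parser and crawler classes via
# the naming scheme above (assumes gargantext.util.parsers.PUBMED and
# gargantext.util.crawlers.PUBMED exist, as that scheme implies):
#
#     resource = get_resource_by_name('Pubmed [XML]')
#     Parser   = load_parser(resource)    # -> PubmedParser class
#     Crawler  = load_crawler(resource)   # -> PubmedCrawler class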