Commit 098ec535 authored by sim, committed by sim

[REFACT] Move node-related constants & funcs into a separate file

parent f8aa5546
"""
# WARNING: to ensure consistency and backward compatibility, lists should keep
# their initial order (i.e., new elements should be appended at the end)
abstract:
---------
@@ -9,10 +7,6 @@ abstract:
contents:
---------
+ database constants/ontology
- nodetypes
(db int <=> named types <=> python code)
+ low-level limits
- query size
- max upload size
@@ -20,150 +14,29 @@
- word extraction batch size
+ main process config
- resourcetypes config (~ input ontology)
- wordlist generation params
- graph creation params
- £TODO sequence of transformations "custom pipeline"
+ subprocess config
- crawling, import
- tagger services and functions
- parser services
- stemmer services
"""
import os
import re
import importlib
from gargantext.util.lists import WeightedList, UnweightedList, WeightedIndex, \
                                  WeightedMatrix, Translations
from gargantext.util.dates import datetime, convert_to_datetime
from .settings import BASE_DIR
# types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
LISTTYPES = {
'DOCUMENT' : WeightedList,
'GROUPLIST' : Translations, # todo remove "LIST" from name
'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList,
'SPECCLUSION' : WeightedList,
'GENCLUSION' : WeightedList,
'OCCURRENCES' : WeightedIndex, # could be WeightedList
'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedIndex,
'TFIDF-GLOBAL' : WeightedIndex,
'TIRANK-LOCAL' : WeightedIndex, # could be WeightedList
}
# 'OWNLIST' : UnweightedList, # £TODO use this for any term-level tags
NODETYPES = [
# TODO use an explicit id rather than the array index (read by models.node)
None, # 0
# documents hierarchy
'USER', # 1
'PROJECT', # 2
# RESOURCE should be here, but was appended last (see WARNING above)
'CORPUS', # 3
'DOCUMENT', # 4
# lists
'STOPLIST', # 5
'GROUPLIST', # 6
'MAINLIST', # 7
'MAPLIST', # 8
'COOCCURRENCES', # 9
# scores
'OCCURRENCES', # 10
'SPECCLUSION', # 11
'CVALUE', # 12
'TFIDF-CORPUS', # 13
'TFIDF-GLOBAL', # 14
# docs subset
'FAVORITES', # 15
# more scores (sorry!)
'TIRANK-LOCAL', # 16
'TIRANK-GLOBAL', # 17
'GENCLUSION', # 18
'RESOURCE', # 19
]
INDEXED_HYPERDATA = {
# TODO use properties during toolchain.hyperdata_indexing
# (type, convert_to_db, convert_from_db)
'count':
{ 'id' : 1
, 'type' : int
, 'convert_to_db' : int
, 'convert_from_db': int
},
'publication_date':
{ 'id' : 2
, 'type' : datetime
, 'convert_to_db' : convert_to_datetime
, 'convert_from_db': convert_to_datetime
},
'title':
{ 'id' : 3
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'authors':
{ 'id' : 4
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'journal':
{ 'id' : 5
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'abstract':
{ 'id' : 6
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
# 'text':
# { 'id' : 7
# , 'type' : str
# , 'convert_to_db' : str
# , 'convert_from_db': str
# },
#
# 'page':
# { 'id' : 8
# , 'type' : int
# , 'convert_to_db' : int
# , 'convert_from_db': int
# },
}
# XXX Originally defined here, imported here for backward-compatibility,
# should be removed later.
from .models.nodes_constants import NODETYPES, LISTTYPES, INDEXED_HYPERDATA, \
                                    RESOURCETYPES, get_resource, get_resource_by_name, \
                                    load_parser, load_crawler
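# Sketch (not part of this commit) of what the re-export above preserves:
# downstream code that still imports these names from gargantext.constants
# keeps working unchanged, e.g.
#
#     from gargantext.constants import NODETYPES, get_resource
#     get_resource(1)['name']   # -> 'Europresse'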
# user parameters----------------------------------------
USER_LANG = ["fr", "en"]
# resources ---------------------------------------------
def get_resource(sourcetype):
    '''resource :: type => resource dict'''
    return RESOURCETYPES[sourcetype-1]

def get_resource_by_name(sourcename):
    '''resource :: name => resource dict'''
    for n in RESOURCETYPES:
        if str(n["name"]) == str(sourcename):
            return n
# taggers -----------------------------------------------
def get_tagger(lang):
'''
@@ -176,132 +49,6 @@ def get_tagger(lang):
    return tagger()
RESOURCETYPES = [
{ "type": 1,
'name': 'Europresse',
'format': 'Europresse',
'parser': "EuropresseParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ 'type': 2,
'name': 'Jstor [RIS]',
'format': 'RIS',
'parser': "RISParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ 'type': 3,
'name': 'Pubmed [XML]',
'format': 'Pubmed',
'parser': "PubmedParser",
'file_formats':["zip", "xml"],
'crawler': "PubmedCrawler",
},
{ 'type': 4,
'name': 'Scopus [RIS]',
'format': 'RIS',
'parser': "RISParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ 'type': 5,
'name': 'Web of Science [ISI]',
'format': 'ISI',
'parser': "ISIParser",
'file_formats':["zip", "txt", "isi"],
#'crawler': "ISICrawler",
'crawler': None,
},
{ 'type': 6,
'name': 'Zotero [RIS]',
'format': 'RIS',
'parser': 'RISParser',
'file_formats':["zip", "ris", "txt"],
'crawler': None,
},
{ 'type': 7,
'name': 'CSV',
'format': 'CSV',
'parser': 'CSVParser',
'file_formats':["zip", "csv"],
'crawler': None,
},
{ 'type': 8,
'name': 'ISTex',
'format': 'json',
'parser': "ISTexParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ "type": 9,
"name": 'SCOAP [API/XML]',
"parser": "CernParser",
"format": 'MARC21',
'file_formats':["zip","xml"],
"crawler": "CernCrawler",
},
# { "type": 10,
# "name": 'REPEC [RIS]',
# "parser": "RISParser",
# "format": 'RIS',
# 'file_formats':["zip","ris", "txt"],
# "crawler": None,
# },
#
{ "type": 10,
"name": 'REPEC [MULTIVAC API]',
"parser": "MultivacParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "MultivacCrawler",
},
{ "type": 11,
"name": 'HAL [API]',
"parser": "HalParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "HalCrawler",
},
{ "type": 12,
"name": 'ISIDORE [SPARQLE API /!\ BETA]',
"parser": "IsidoreParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "IsidoreCrawler",
},
]
def load_parser(resource):
    '''given a resource, load the corresponding Parser class
    resource(dict) -> Parser(class)
    example with the ISTex resource:
        parser module name: ISTEX
        parser object:      ISTexParser
    '''
    filename = resource["parser"].replace("Parser", '').upper()
    module = 'gargantext.util.parsers.%s' % filename
    module = importlib.import_module(module)
    return getattr(module, resource["parser"])
def load_crawler(resource):
    '''given a resource, load the corresponding Crawler class
    resource(dict) -> Crawler(class)
    (assumes resource["crawler"] is set)
    example with the ISTex resource:
        crawler module name: ISTEX
        crawler object:      ISTexCrawler
    '''
    filename = resource["crawler"].replace("Crawler", "").upper()
    module = 'gargantext.util.crawlers.%s' % filename
    module = importlib.import_module(module)
    return getattr(module, resource["crawler"])
# Supported languages and taggers ---------------------------------------------
# first declare the tagger as a string,
# then it will be imported from gargantext.util.taggers
@@ -329,7 +76,6 @@ def load_tagger(lang):
    return getattr(module, filename)()
# linguistic extraction parameters ---------------------------------------------
DEFAULT_RANK_CUTOFF_RATIO = .75 # max proportion of terms kept in the MAINLIST
@@ -367,6 +113,7 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# "cool example".
# (all 1 to n-1 length ngrams,
# at indexing after extraction)
# TAGGING options -----------------------------------------
# activate language detection?
DETECT_LANG = False
gargantext/models/nodes.py:
from gargantext.util.db import session
from gargantext.util.files import upload
from gargantext.constants import *
from datetime import datetime
@@ -8,8 +7,11 @@ from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
                  Integer, Float, String, DateTime, JSONB, \
                  MutableList, MutableDict
from .users import User
+from .nodes_constants import NODETYPES, LISTTYPES, INDEXED_HYPERDATA
-__all__ = ['NODETYPES', 'LISTTYPES', 'INDEXED_HYPERDATA',
-           'Node', 'NodeNode', 'CorpusNode']
+__all__ = ['Node', 'NodeNode', 'CorpusNode']
class NodeType(TypeDecorator):
"""Define a new type of column to describe a Node's type.
@@ -17,8 +19,10 @@ class NodeType(TypeDecorator):
Values are detailed in `gargantext.constants.NODETYPES`.
"""
impl = Integer
    def process_bind_param(self, typename, dialect):
        return NODETYPES.index(typename)

    def process_result_value(self, typeindex, dialect):
        return NODETYPES[typeindex]
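# Sketch (not in the original file): the int <=> typename round-trip the
# decorator above performs; it relies on NODETYPES keeping its initial
# order, per the WARNING in nodes_constants.
assert NODETYPES.index('CORPUS') == 3   # the integer stored in the DB column
assert NODETYPES[3] == 'CORPUS'         # the name handed back to Python code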
gargantext/models/nodes_constants.py (new file):
"""
# WARNING: to ensure consistency and backward compatibility, lists should keep
# their initial order (i.e., new elements should be appended at the end)
abstract:
---------
constants and utility functions related to nodes,
initially located in gargantext.constants
contents:
---------
+ database constants/ontology
- nodetypes
(db int <=> named types <=> python code)
+ main process config
- resourcetypes config (~ input ontology)
+ subprocess config
- crawling, import
- parser services
"""
import importlib
from ..util.lists import WeightedList, UnweightedList, WeightedIndex, \
                         WeightedMatrix, Translations
from ..util.dates import datetime, convert_to_datetime
# types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
NODETYPES = [
# TODO use an explicit id rather than the array index (read by models.node)
None, # 0
# documents hierarchy
'USER', # 1
'PROJECT', # 2
# RESOURCE should be here, but was appended last (see WARNING above)
'CORPUS', # 3
'DOCUMENT', # 4
# lists
'STOPLIST', # 5
'GROUPLIST', # 6
'MAINLIST', # 7
'MAPLIST', # 8
'COOCCURRENCES', # 9
# scores
'OCCURRENCES', # 10
'SPECCLUSION', # 11
'CVALUE', # 12
'TFIDF-CORPUS', # 13
'TFIDF-GLOBAL', # 14
# docs subset
'FAVORITES', # 15
# more scores (sorry!)
'TIRANK-LOCAL', # 16
'TIRANK-GLOBAL', # 17
'GENCLUSION', # 18
'RESOURCE', # 19
]
LISTTYPES = {
'DOCUMENT' : WeightedList,
'GROUPLIST' : Translations, # todo remove "LIST" from name
'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList,
'SPECCLUSION' : WeightedList,
'GENCLUSION' : WeightedList,
'OCCURRENCES' : WeightedIndex, # could be WeightedList
'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedIndex,
'TFIDF-GLOBAL' : WeightedIndex,
'TIRANK-LOCAL' : WeightedIndex, # could be WeightedList
}
# 'OWNLIST' : UnweightedList, # £TODO use this for any term-level tags
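# Illustrative lookups (not part of the original file): a list node's
# typename selects the Python container class used to handle its data.
assert LISTTYPES['STOPLIST'] is UnweightedList
assert LISTTYPES['COOCCURRENCES'] is WeightedMatrix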
INDEXED_HYPERDATA = {
# TODO use properties during toolchain.hyperdata_indexing
# (type, convert_to_db, convert_from_db)
'count':
{ 'id' : 1
, 'type' : int
, 'convert_to_db' : int
, 'convert_from_db': int
},
'publication_date':
{ 'id' : 2
, 'type' : datetime
, 'convert_to_db' : convert_to_datetime
, 'convert_from_db': convert_to_datetime
},
'title':
{ 'id' : 3
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'authors':
{ 'id' : 4
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'journal':
{ 'id' : 5
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'abstract':
{ 'id' : 6
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
# 'text':
# { 'id' : 7
# , 'type' : str
# , 'convert_to_db' : str
# , 'convert_from_db': str
# },
#
# 'page':
# { 'id' : 8
# , 'type' : int
# , 'convert_to_db' : int
# , 'convert_from_db': int
# },
}
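# Sketch of how the (id, converter) pairs above are meant to be used when
# indexing hyperdata; `index_value` is a hypothetical helper, not part of
# this module:
def index_value(field, raw_value):
    spec = INDEXED_HYPERDATA[field]
    return spec['id'], spec['convert_to_db'](raw_value)

# e.g. index_value('title', 42) == (3, '42')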
# resources ---------------------------------------------
def get_resource(sourcetype):
    '''resource :: type => resource dict'''
    return RESOURCETYPES[sourcetype-1]

def get_resource_by_name(sourcename):
    '''resource :: name => resource dict'''
    for n in RESOURCETYPES:
        if str(n["name"]) == str(sourcename):
            return n
RESOURCETYPES = [
{ "type": 1,
'name': 'Europresse',
'format': 'Europresse',
'parser': "EuropresseParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ 'type': 2,
'name': 'Jstor [RIS]',
'format': 'RIS',
'parser': "RISParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ 'type': 3,
'name': 'Pubmed [XML]',
'format': 'Pubmed',
'parser': "PubmedParser",
'file_formats':["zip", "xml"],
'crawler': "PubmedCrawler",
},
{ 'type': 4,
'name': 'Scopus [RIS]',
'format': 'RIS',
'parser': "RISParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ 'type': 5,
'name': 'Web of Science [ISI]',
'format': 'ISI',
'parser': "ISIParser",
'file_formats':["zip", "txt", "isi"],
#'crawler': "ISICrawler",
'crawler': None,
},
{ 'type': 6,
'name': 'Zotero [RIS]',
'format': 'RIS',
'parser': 'RISParser',
'file_formats':["zip", "ris", "txt"],
'crawler': None,
},
{ 'type': 7,
'name': 'CSV',
'format': 'CSV',
'parser': 'CSVParser',
'file_formats':["zip", "csv"],
'crawler': None,
},
{ 'type': 8,
'name': 'ISTex',
'format': 'json',
'parser': "ISTexParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ "type": 9,
"name": 'SCOAP [API/XML]',
"parser": "CernParser",
"format": 'MARC21',
'file_formats':["zip","xml"],
"crawler": "CernCrawler",
},
# { "type": 10,
# "name": 'REPEC [RIS]',
# "parser": "RISParser",
# "format": 'RIS',
# 'file_formats':["zip","ris", "txt"],
# "crawler": None,
# },
#
{ "type": 10,
"name": 'REPEC [MULTIVAC API]',
"parser": "MultivacParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "MultivacCrawler",
},
{ "type": 11,
"name": 'HAL [API]',
"parser": "HalParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "HalCrawler",
},
{ "type": 12,
"name": 'ISIDORE [SPARQLE API /!\ BETA]',
"parser": "IsidoreParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "IsidoreCrawler",
},
]
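# Illustrative lookups against the table above (not part of the original file):
assert get_resource(3)['name'] == 'Pubmed [XML]'             # type 3 -> index 2
assert get_resource_by_name('Pubmed [XML]')['parser'] == 'PubmedParser'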
def load_parser(resource):
    '''given a resource, load the corresponding Parser class
    resource(dict) -> Parser(class)
    example with the ISTex resource:
        parser module name: ISTEX
        parser object:      ISTexParser
    '''
    filename = resource["parser"].replace("Parser", '').upper()
    module = 'gargantext.util.parsers.%s' % filename
    module = importlib.import_module(module)
    return getattr(module, resource["parser"])
def load_crawler(resource):
    '''given a resource, load the corresponding Crawler class
    resource(dict) -> Crawler(class)
    (assumes resource["crawler"] is set)
    example with the ISTex resource:
        crawler module name: ISTEX
        crawler object:      ISTexCrawler
    '''
    filename = resource["crawler"].replace("Crawler", "").upper()
    module = 'gargantext.util.crawlers.%s' % filename
    module = importlib.import_module(module)
    return getattr(module, resource["crawler"])
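# Hedged usage sketch: resolving the Pubmed parser and crawler classes via
# the naming scheme above (assumes gargantext.util.parsers.PUBMED and
# gargantext.util.crawlers.PUBMED exist, as that scheme implies):
#
#     resource = get_resource_by_name('Pubmed [XML]')
#     Parser   = load_parser(resource)    # -> PubmedParser class
#     Crawler  = load_crawler(resource)   # -> PubmedCrawler class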