Commit 6740e4df authored by c24b's avatar c24b

Import constants for crawlers + taggers

parent 9c5c9d7e
# WARNING: to ensure consistency and retrocompatibility, lists should keep the
# initial order (ie., new elements should be appended at the end of the lists)
import importlib
from gargantext.util.lists import *
from gargantext.util.tools import datetime, convert_to_date
import re
# types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
LISTTYPES = {
'DOCUMENT' : WeightedList,
'GROUPLIST' : Translations, # todo remove "LIST" from name
......@@ -19,16 +20,16 @@ LISTTYPES = {
'TFIDF-CORPUS' : WeightedIndex,
'TFIDF-GLOBAL' : WeightedIndex,
'TIRANK-LOCAL' : WeightedIndex, # could be WeightedList
'TIRANK-GLOBAL' : WeightedIndex, # could be WeightedList
}
# 'OWNLIST' : UnweightedList, # £TODO use this for any term-level tags
NODETYPES = [
# TODO separate id not array index, read by models.node
None,
None, # 0
# documents hierarchy
'USER', # 1
'PROJECT', # 2
#RESOURCE should be here but last
'CORPUS', # 3
'DOCUMENT', # 4
# lists
......@@ -45,11 +46,12 @@ NODETYPES = [
'TFIDF-GLOBAL', # 14
# docs subset
'FAVORITES', # 15
# more scores (sorry!)
'TIRANK-LOCAL', # 16
'TIRANK-GLOBAL', # 17
'GENCLUSION', # 18
'RESOURCE', # 19
]
INDEXED_HYPERDATA = {
......@@ -114,34 +116,17 @@ INDEXED_HYPERDATA = {
}
#from gargantext.util.taggers import FrenchMeltTagger, TurboTagger
from gargantext.util.taggers import NltkTagger
# Language -> tagger configuration, binding tagger *classes* directly.
# NOTE(review): a later section of this file re-declares LANGUAGES with the
# tagger given as a class-name *string* (resolved lazily via importlib by
# get_tagger()); at import time the later assignment would win, making this
# class-based version dead code — confirm and remove one of the two.
LANGUAGES = {
'en': {
#'tagger': EnglishMeltTagger,
#'tagger': TurboTagger,
'tagger': NltkTagger,
},
'fr': {
#'tagger': FrenchMeltTagger,
#'tagger': TreeTagger,
'tagger': NltkTagger,
},
}
from gargantext.util.parsers import \
EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser, RepecParser
# resources ---------------------------------------------
def get_resource(sourcetype):
    '''resource :: type => resource dict

    Scans RESOURCETYPES for the entry whose "type" id equals *sourcetype*
    (compared as int); returns the matching dict, or None when absent.
    '''
    wanted = int(sourcetype)
    matches = (entry for entry in RESOURCETYPES if int(entry["type"]) == wanted)
    return next(matches, None)
def resourcetype(name):
    '''
    resourcetype :: String -> Int
    Usage : resourcetype("Pubmed") == 3
    Examples in scrapers scripts (Pubmed or ISTex for instance).

    Raises IndexError when *name* matches no declared resource.
    '''
    # Fix: a diff-merge artifact left the obsolete positional-index lookup
    # as an early return, making the intended "type"-key lookup unreachable
    # (and stranding its docstring mid-function). Keep only the new lookup.
    return [n["type"] for n in RESOURCETYPES if n["name"] == name][0]
def resourcename(corpus):
    '''
    resourcename :: Corpus Node -> String
    Usage : resourcename(corpus) == "ISTex"

    Returns the declared source name of the corpus's first resource.
    '''
    # Fix: a diff-merge artifact kept the obsolete lookup (positional index
    # into RESOURCETYPES + regex strip of a "(...)" suffix) as an early
    # return, shadowing the intended get_resource()-based lookup below.
    resource = corpus.resources()[0]
    rtype = resource["type"]
    return get_resource(rtype)["name"]
def get_tagger(lang):
    '''
    lang => tagger instance

    Looks up the tagger class *name* (a string) for *lang* in LANGUAGES,
    imports gargantext.util.taggers.<name> and returns a fresh instance
    of the class of the same name.

    Raises KeyError for an undeclared language, ImportError/AttributeError
    when the declared tagger module or class is missing.
    '''
    name = LANGUAGES[lang]["tagger"]
    # Fix: dropped a leftover debug print() and the unused package="" arg
    # (only meaningful for relative module names; this one is absolute).
    module = importlib.import_module("gargantext.util.taggers.%s" % name)
    return getattr(module, name)()
# Each resource source declares:
#   type              : stable integer id (stored with corpora — never
#                       renumber; append new sources at the end)
#   name              : human-readable source name
#   format            : input format family
#   parser            : parser class name (string), resolved lazily by
#                       load_parser()
#   file_formats      : accepted upload file extensions
#   crawler           : crawler class name (string) resolved lazily by
#                       load_crawler(), or None when the source has none
#   default_languages : languages assumed for documents from this source
#
# Fix: a diff-merge artifact had interleaved the obsolete pre-commit entries
# (dicts without "type"/"crawler"/"format" keys) with the new ones; those
# stale entries would raise KeyError in the PARSERS/CRAWLERS comprehensions
# below. Only the new-format declarations are kept.
RESOURCETYPES = [
    {   'type': 1,
        'name': 'Europress',
        'format': 'Europress',
        'parser': 'EuropressParser',
        'file_formats': ["zip"],
        'crawler': None,
        'default_languages': ['en', 'fr'],
    },
    {   'type': 2,
        'name': 'Jstor',
        'format': 'RIS',
        'parser': 'RISParser',
        'file_formats': ["zip"],
        'crawler': None,
        'default_languages': ['en'],
    },
    {   'type': 3,
        'name': 'Pubmed',
        'format': 'Pubmed',
        'parser': 'PubmedParser',
        'file_formats': ["zip", "xml"],
        'crawler': 'PubmedCrawler',
        'default_languages': ['en'],
    },
    {   'type': 4,
        'name': 'Scopus',
        'format': 'RIS',
        'parser': 'RISParser',
        'file_formats': ["zip"],
        'crawler': None,
        'default_languages': ['en'],
    },
    {   'type': 5,
        'name': 'Web of Science',
        'format': 'ISI',
        'parser': 'ISIParser',
        'file_formats': ["zip"],
        #'crawler': "ISICrawler",
        'crawler': None,
        'default_languages': ['en'],
    },
    {   'type': 6,
        'name': 'Zotero',
        'format': 'RIS',
        'parser': 'RISParser',
        'file_formats': ["zip"],
        'crawler': None,
        'default_languages': ['en'],
    },
    {   'type': 7,
        'name': 'CSV',
        'format': 'CSV',
        'parser': 'CSVParser',
        'file_formats': ["zip", "csv"],
        'crawler': None,
        'default_languages': ['en'],
    },
    {   'type': 8,
        'name': 'ISTex',
        'format': 'ISI',
        'parser': 'ISTexParser',
        'file_formats': ["zip"],
        #'crawler': "ISICrawler",
        'crawler': None,
        'default_languages': ['en', 'fr'],
    },
    {   'type': 9,
        'name': 'SCOAP (XML MARC21 Format)',
        'format': 'MARC21',
        'parser': 'CernParser',
        'file_formats': ["zip", "xml"],
        'crawler': 'CernCrawler',
        'default_languages': ['en'],
    },
]
#shortcut for resources declaration
def _declared_sources(component):
    # (type, name) pairs for every resource declaring the given component
    # ("parser" or "crawler"); declaration order is preserved.
    return [(res["type"], res["name"])
            for res in RESOURCETYPES
            if res[component] is not None]

PARSERS = _declared_sources("parser")
CRAWLERS = _declared_sources("crawler")
def load_parser(resource):
    """Import and return the parser class named by ``resource.parser``.

    The module is expected at gargantext.util.parsers.<Name>, where <Name>
    is the class name with its "Parser" suffix stripped.
    """
    module_name = resource.parser.replace("Parser", "")
    parser_module = importlib.import_module(
        "gargantext.util.parsers.%s" % module_name)
    return getattr(parser_module, resource.parser)
def load_crawler(resource):
    """Import and return the crawler class named by ``resource.crawler``.

    Fix: the original read an undefined global ``name`` (NameError on every
    call); the class name must come from resource.crawler. The module file
    is the class name minus the "Crawler" suffix, lowercased — matching the
    auto-import loop in gargantext.util.crawlers (e.g. PubmedCrawler ->
    gargantext.util.crawlers.pubmed).
    """
    filename = resource.crawler.replace("Crawler", "").lower()
    module = 'gargantext.util.crawlers.%s' % (filename)
    module = importlib.import_module(module)
    return getattr(module, resource.crawler)
# Supported languages and taggers ---------------------------------------------
# Taggers are declared by class-name *string*; get_tagger() imports the
# matching module from gargantext.util.taggers on demand.
#
# Fix: a diff-merge artifact had spliced obsolete RESOURCETYPES entries
# (old SCOAP type 10 and REPEC type 11 dicts) and a removed load_tagger()
# stub into the middle of this dict, leaving the region syntactically
# invalid. Reconstructed the intended string-based declaration.
LANGUAGES = {
    'en': {
        #'tagger': 'EnglishMeltTagger',
        #'tagger': "TurboTagger",
        'tagger': 'NltkTagger',
    },
    'fr': {
        #'tagger': "FrenchMeltTagger",
        #'tagger': 'TreeTagger',
        'tagger': 'NltkTagger',
    },
}
# linguistic extraction parameters ---------------------------------------------
# Both limits cap the MAINLIST size after term ranking; the smaller of the
# two effective bounds wins (ratio of ranked terms vs. absolute count).
DEFAULT_RANK_CUTOFF_RATIO = .75 # MAINLIST maximum terms in %
DEFAULT_RANK_HARD_LIMIT = 5000 # MAINLIST maximum terms abs
......@@ -257,19 +307,22 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# "cool example".
# (all 1 to n-1 length ngrams,
# at indexing after extraction)
DEFAULT_INDEX_FIELDS = ('title', 'abstract', ) #Defaults Fields for ngrams extraction
# Grammar rules for chunking
# POS-tag chunk grammars — presumably consumed by an NLTK RegexpParser-style
# chunker (TODO confirm against the tagger/parser call sites):
#   RULE_JJNN   : adjective* noun+ adjective*  (simple NP)
#   RULE_JJDTNN : NP optionally extended by prepositional/determiner chains
RULE_JJNN = "{<JJ.*>*<NN.*|>+<JJ.*>*}"
RULE_JJDTNN = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
# RULE_TINA matches over a comma-joined tag string (note the "," after each
# tag class) rather than chunker angle-bracket syntax.
RULE_TINA = "^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)\
+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?\
,){0,2}?(N.?.?,|\?,)+?)+?)*?$"
# ngram lists import/export parameters -----------------------------------------
DEFAULT_CSV_DELIM = '\t' # for import/export CSV defaults
# group-separator token used inside a single CSV cell
DEFAULT_CSV_DELIM_GROUP = '|&|'
# ------------------------------------------------------------------------------
# other parameters
# default number of docs POSTed to scrappers.views.py
# (at page project > add a corpus > scan/process sample)
QUERY_SIZE_N_DEFAULT = 1000
# Files ----------------------------------------------------------------
import os
from .settings import BASE_DIR
# uploads/.gitignore prevents corpora indexing
......@@ -278,7 +331,7 @@ UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads/corpora')
# maximum accepted upload size: 1 GiB
UPLOAD_LIMIT = 1024 * 1024 * 1024
# exports are served from the same directory as uploads
DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
# Processing -----------------------------------------------------------
# about batch processing...
BATCH_PARSING_SIZE = 256
BATCH_NGRAMSEXTRACTION_SIZE = 3000 # how many distinct ngrams before INTEGRATE
......
import importlib
from gargantext.constants import RESOURCETYPES
from gargantext.settings import DEBUG
#if DEBUG: print("Loading available Crawlers")
base_parser = "gargantext.util.crawlers"
# Import every crawler module declared in RESOURCETYPES so the classes are
# registered; a broken declaration is reported but does not abort startup.
for resource in RESOURCETYPES:
    crawler_name = resource["crawler"]
    if crawler_name is None:
        continue
    try:
        # module file is the crawler class name minus "Crawler", lowercased
        submodule = crawler_name.replace("Crawler", "").lower()
        importlib.import_module("%s.%s" % (base_parser, submodule))
        #if DEBUG: print("\t-", crawler_name)
    except Exception as e:
        print("Check constants.py RESOURCETYPES declaration %s \nCRAWLER %s is not available for %s" %(str(e), resource["crawler"], resource["name"]))
#initial import
#from .cern import CernCrawler
#from .istex import ISTexCrawler
#from .pubmed import PubmedCrawler
from .TurboTagger import TurboTagger
from .NltkTagger import NltkTagger
from .TreeTagger import TreeTagger
from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
#version2
#imported as needed
#Version 1
#~ import importlib
#~ from gargantext.constants import LANGUAGES
#~ from gargantext.settings import DEBUG
#~ if DEBUG:
#~ print("Loading available Taggers:")
#~ for lang, tagger in LANGUAGES.items():
#~ tagger = tagger["tagger"]
#~ filename = "gargantext.util.taggers.%s" %(tagger)
#~ if DEBUG:
#~ print("\t-%s (%s)" %(tagger, lang))
#~ getattr(importlib.import_module(filename), tagger)()
#VERSION 0
#~ #initally a manual import declaration
#~ from .TurboTagger import TurboTagger
#~ from .NltkTagger import NltkTagger
#~ from .TreeTagger import TreeTagger
#~ from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment