Commit e3c7663f authored by Administrator's avatar Administrator

Merge branch 'unstable' into celery

parents c862a29b d086dfd5
......@@ -104,30 +104,15 @@ except Exception as error:
# In[33]:
try:
typePubmed = ResourceType.objects.get(name='pubmed')
typeIsi = ResourceType.objects.get(name='isi')
typeRis = ResourceType.objects.get(name='ris')
typePresseFrench = ResourceType.objects.get(name='europress_french')
typePresseEnglish = ResourceType.objects.get(name='europress_english')
except Exception as error:
print(error)
from parsing.parsers_config import parsers
typePubmed = ResourceType(name='pubmed')
typePubmed.save()
ResourceType.objects.all().delete()
typeIsi = ResourceType(name='isi')
typeIsi.save()
typeRis = ResourceType(name='ris')
typeRis.save()
typePresseFrench = ResourceType(name='europress_french')
typePresseFrench.save()
typePresseEnglish = ResourceType(name='europress_english')
typePresseEnglish.save()
for key in parsers.keys():
try:
ResourceType.objects.get_or_create(name=key)
except Exception as error:
print("Ressource Error: ", error)
# In[34]:
......
......@@ -4,6 +4,7 @@ psql -d gargandb -f init.sql
sleep 2
../manage.py syncdb
psql -d gargandb -f init2.sql
......
ALTER TABLE ONLY node_node ALTER COLUMN date SET DEFAULT CURRENT_DATE ;
......@@ -54,16 +54,13 @@ import pycountry
Language.objects.all().delete()
for language in pycountry.languages:
if 'alpha2' in language.__dict__:
Language(
models.Language(
iso2 = language.alpha2,
iso3 = language.bibliographic,
fullname = language.name,
implemented = 1 if language.alpha2 in ['en', 'fr'] else 0,
).save()
english = Language.objects.get(iso2='en')
french = Language.objects.get(iso2='fr')
# Integration: users
......@@ -99,56 +96,10 @@ for node_type in node_types:
print('Initialize resource...')
resources = [
'pubmed', 'isi', 'ris', 'europress_french', 'europress_english']
for resource in resources:
models.ResourceType.objects.get_or_create(name=resource)
# TODO
# here some tests
# add a new project and some corpora to test it
# Integration: project
#
#print('Initialize project...')
#try:
# project = Node.objects.get(name='Bees project')
#except:
# project = Node(name='Bees project', type=typeProject, user=me)
# project.save()
#
# Integration: corpus
#print('Initialize corpus...')
#try:
# corpus_pubmed = Node.objects.get(name='PubMed corpus')
#except:
# corpus_pubmed = Node(parent=project, name='PubMed corpus', type=typeCorpus, user=me)
# corpus_pubmed.save()
#
#print('Initialize resource...')
#corpus_pubmed.add_resource(
# # file='./data_samples/pubmed.zip',
# #file='./data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
# file='/srv/gargantext_lib/data_samples/pubmed.xml',
# type=typePubmed,
# user=me
#)
#
#for resource in corpus_pubmed.get_resources():
# print('Resource #%d - %s - %s' % (resource.id, resource.digest, resource.file))
#
## print('Parse corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.parse_resources(verbose=True)
# print('Extract corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.children.all().extract_ngrams(['title',])
# print('Parsed corpus #%d.' % (corpus_pubmed.id, ))
from parsing.parsers_config import parsers
for parser in parsers.keys():
models.ResourceType.objects.get_or_create(name=parser)
......
......@@ -29,6 +29,7 @@ from celery import current_app
import os
import subprocess
from parsing.parsers_config import parsers
# Some usefull functions
# TODO: start the function name with an underscore (private)
......@@ -194,15 +195,19 @@ class Node(CTENode):
print("= = = = = = = = = = =\n")
for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource
parser = defaultdict(lambda:FileParser.FileParser, {
'istext' : ISText,
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
})[resource.type.name]()
parser = defaultdict(lambda:FileParser.FileParser, parsers
# {
# 'istext' : ISText,
# 'pubmed' : PubmedFileParser,
# 'isi' : IsiFileParser,
# 'ris' : RisFileParser,
# 'RIS (Jstor)' : JstorFileParser,
# 'europress' : EuropressFileParser,
# 'europress_french' : EuropressFileParser,
# 'europress_english' : EuropressFileParser,
# }
)[resource.type.name]()
metadata_list += parser.parse(str(resource.file))
type_id = NodeType.objects.get(name='Document').id
langages_cache = LanguagesCache()
......
from .RisFileParser import RisFileParser
class IsiFileParser(RisFileParser):
_parameters = {
......
from .RisFileParser import RisFileParser
class JstorFileParser(RisFileParser):
_parameters = {
b"ER": {"type": "delimiter"},
b"TI": {"type": "metadata", "key": "title", "separator": " "},
b"AU": {"type": "metadata", "key": "authors", "separator": ", "},
b"UR": {"type": "metadata", "key": "doi"},
b"Y1": {"type": "metadata", "key": "publication_year"},
b"PD": {"type": "metadata", "key": "publication_month"},
b"LA": {"type": "metadata", "key": "language_iso2"},
b"AB": {"type": "metadata", "key": "abstract", "separator": " "},
b"WC": {"type": "metadata", "key": "fields"},
}
from .RisFileParser import RisFileParser
from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .ISText import ISText
......@@ -7,7 +7,6 @@ from math import log
from gargantext_web.db import *
from .FileParsers import *
......@@ -30,15 +29,7 @@ class DebugTime:
# keep all the parsers in a cache
class Parsers(defaultdict):
_parsers = {
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
}
from .parsers_config import parsers as _parsers
def __missing__(self, key):
if key not in self._parsers:
......
from .FileParsers import *
parsers = {
'Pubmed (xml format)' : PubmedFileParser,
'Web of Science (ISI format)' : IsiFileParser,
'Scopus ou Zotero (RIS format)' : RisFileParser,
'Jstor (RIS format)' : JstorFileParser,
#'Europress' : EuropressFileParser,
'Europress (French)' : EuropressFileParser,
'Europress (English)' : EuropressFileParser,
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment