Commit e3c7663f authored by Administrator

Merge branch 'unstable' into celery

parents c862a29b d086dfd5
...@@ -104,30 +104,15 @@ except Exception as error: ...@@ -104,30 +104,15 @@ except Exception as error:
# In[33]: # In[33]:
try: from parsing.parsers_config import parsers
typePubmed = ResourceType.objects.get(name='pubmed')
typeIsi = ResourceType.objects.get(name='isi')
typeRis = ResourceType.objects.get(name='ris')
typePresseFrench = ResourceType.objects.get(name='europress_french')
typePresseEnglish = ResourceType.objects.get(name='europress_english')
except Exception as error: ResourceType.objects.all().delete()
print(error)
for key in parsers.keys():
typePubmed = ResourceType(name='pubmed') try:
typePubmed.save() ResourceType.objects.get_or_create(name=key)
except Exception as error:
typeIsi = ResourceType(name='isi') print("Ressource Error: ", error)
typeIsi.save()
typeRis = ResourceType(name='ris')
typeRis.save()
typePresseFrench = ResourceType(name='europress_french')
typePresseFrench.save()
typePresseEnglish = ResourceType(name='europress_english')
typePresseEnglish.save()
# In[34]: # In[34]:
......
...@@ -4,6 +4,7 @@ psql -d gargandb -f init.sql ...@@ -4,6 +4,7 @@ psql -d gargandb -f init.sql
sleep 2 sleep 2
../manage.py syncdb ../manage.py syncdb
psql -d gargandb -f init2.sql psql -d gargandb -f init2.sql
......
ALTER TABLE ONLY node_node ALTER COLUMN date SET DEFAULT CURRENT_DATE ;
...@@ -54,16 +54,13 @@ import pycountry ...@@ -54,16 +54,13 @@ import pycountry
Language.objects.all().delete() Language.objects.all().delete()
for language in pycountry.languages: for language in pycountry.languages:
if 'alpha2' in language.__dict__: if 'alpha2' in language.__dict__:
Language( models.Language(
iso2 = language.alpha2, iso2 = language.alpha2,
iso3 = language.bibliographic, iso3 = language.bibliographic,
fullname = language.name, fullname = language.name,
implemented = 1 if language.alpha2 in ['en', 'fr'] else 0, implemented = 1 if language.alpha2 in ['en', 'fr'] else 0,
).save() ).save()
english = Language.objects.get(iso2='en')
french = Language.objects.get(iso2='fr')
# Integration: users # Integration: users
...@@ -99,56 +96,10 @@ for node_type in node_types: ...@@ -99,56 +96,10 @@ for node_type in node_types:
print('Initialize resource...') print('Initialize resource...')
resources = [ from parsing.parsers_config import parsers
'pubmed', 'isi', 'ris', 'europress_french', 'europress_english']
for resource in resources:
models.ResourceType.objects.get_or_create(name=resource)
# TODO
# here some tests
# add a new project and some corpora to test it
# Integration: project
#
#print('Initialize project...')
#try:
# project = Node.objects.get(name='Bees project')
#except:
# project = Node(name='Bees project', type=typeProject, user=me)
# project.save()
#
# Integration: corpus
#print('Initialize corpus...')
#try:
# corpus_pubmed = Node.objects.get(name='PubMed corpus')
#except:
# corpus_pubmed = Node(parent=project, name='PubMed corpus', type=typeCorpus, user=me)
# corpus_pubmed.save()
#
#print('Initialize resource...')
#corpus_pubmed.add_resource(
# # file='./data_samples/pubmed.zip',
# #file='./data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
# file='/srv/gargantext_lib/data_samples/pubmed.xml',
# type=typePubmed,
# user=me
#)
#
#for resource in corpus_pubmed.get_resources():
# print('Resource #%d - %s - %s' % (resource.id, resource.digest, resource.file))
#
## print('Parse corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.parse_resources(verbose=True)
# print('Extract corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.children.all().extract_ngrams(['title',])
# print('Parsed corpus #%d.' % (corpus_pubmed.id, ))
for parser in parsers.keys():
models.ResourceType.objects.get_or_create(name=parser)
......
...@@ -29,6 +29,7 @@ from celery import current_app ...@@ -29,6 +29,7 @@ from celery import current_app
import os import os
import subprocess import subprocess
from parsing.parsers_config import parsers
# Some useful functions # Some useful functions
# TODO: start the function name with an underscore (private) # TODO: start the function name with an underscore (private)
...@@ -194,15 +195,19 @@ class Node(CTENode): ...@@ -194,15 +195,19 @@ class Node(CTENode):
print("= = = = = = = = = = =\n") print("= = = = = = = = = = =\n")
for node_resource in self.node_resource.filter(parsed=False): for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource resource = node_resource.resource
parser = defaultdict(lambda:FileParser.FileParser, { parser = defaultdict(lambda:FileParser.FileParser, parsers
'istext' : ISText, # {
'pubmed' : PubmedFileParser, # 'istext' : ISText,
'isi' : IsiFileParser, # 'pubmed' : PubmedFileParser,
'ris' : RisFileParser, # 'isi' : IsiFileParser,
'europress' : EuropressFileParser, # 'ris' : RisFileParser,
'europress_french' : EuropressFileParser, # 'RIS (Jstor)' : JstorFileParser,
'europress_english' : EuropressFileParser, # 'europress' : EuropressFileParser,
})[resource.type.name]() # 'europress_french' : EuropressFileParser,
# 'europress_english' : EuropressFileParser,
# }
)[resource.type.name]()
metadata_list += parser.parse(str(resource.file)) metadata_list += parser.parse(str(resource.file))
type_id = NodeType.objects.get(name='Document').id type_id = NodeType.objects.get(name='Document').id
langages_cache = LanguagesCache() langages_cache = LanguagesCache()
......
from .RisFileParser import RisFileParser from .RisFileParser import RisFileParser
class IsiFileParser(RisFileParser): class IsiFileParser(RisFileParser):
_parameters = { _parameters = {
......
from .RisFileParser import RisFileParser
class JstorFileParser(RisFileParser):
    """Parser configuration for Jstor exports in RIS format.

    The class body only supplies its own ``_parameters`` tag table; all
    parsing behaviour is inherited from ``RisFileParser``.
    """

    # Tag table: each key is a two-letter record tag (as bytes) and each
    # value describes how that tag is handled.
    #   - {"type": "delimiter"}: presumably marks the end of one record
    #     (NOTE(review): confirm against RisFileParser).
    #   - {"type": "metadata", "key": ...}: the tag's value is stored under
    #     the given metadata key; the optional "separator" is presumably
    #     used to join multiple values for the same tag (TODO confirm).
    _parameters = {
        # Record boundary.
        b"ER": {"type": "delimiter"},

        # Bibliographic fields.
        b"TI": {"type": "metadata", "key": "title", "separator": " "},
        b"AU": {"type": "metadata", "key": "authors", "separator": ", "},
        # NOTE(review): the "UR" tag is stored under the "doi" key; verify
        # that this is the intended identifier for Jstor records.
        b"UR": {"type": "metadata", "key": "doi"},

        # Publication date and language.
        b"Y1": {"type": "metadata", "key": "publication_year"},
        b"PD": {"type": "metadata", "key": "publication_month"},
        b"LA": {"type": "metadata", "key": "language_iso2"},

        # Content fields.
        b"AB": {"type": "metadata", "key": "abstract", "separator": " "},
        b"WC": {"type": "metadata", "key": "fields"},
    }
from .RisFileParser import RisFileParser from .RisFileParser import RisFileParser
from .IsiFileParser import IsiFileParser from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
from .PubmedFileParser import PubmedFileParser from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser from .EuropressFileParser import EuropressFileParser
from .ISText import ISText from .ISText import ISText
...@@ -7,7 +7,6 @@ from math import log ...@@ -7,7 +7,6 @@ from math import log
from gargantext_web.db import * from gargantext_web.db import *
from .FileParsers import *
...@@ -30,15 +29,7 @@ class DebugTime: ...@@ -30,15 +29,7 @@ class DebugTime:
# keep all the parsers in a cache # keep all the parsers in a cache
class Parsers(defaultdict): class Parsers(defaultdict):
from .parsers_config import parsers as _parsers
_parsers = {
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
}
def __missing__(self, key): def __missing__(self, key):
if key not in self._parsers: if key not in self._parsers:
......
from .FileParsers import *
# Registry of the available file parsers.
#
# Each key is the display name of a supported import format and each value
# is the parser class registered for it.  Elsewhere in this changeset the
# keys are used as ResourceType names and the mapping is used to look up a
# parser from a resource's type name, so the key strings must stay in sync
# with the stored resource types.
#
# The generic "Europress" entry is intentionally left disabled below; only
# the language-specific Europress entries are registered.
parsers = {
    'Pubmed (xml format)': PubmedFileParser,
    'Web of Science (ISI format)': IsiFileParser,
    'Scopus ou Zotero (RIS format)': RisFileParser,
    'Jstor (RIS format)': JstorFileParser,
    # 'Europress': EuropressFileParser,
    'Europress (French)': EuropressFileParser,
    'Europress (English)': EuropressFileParser,
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment