Commit 1f000317 authored by Mathieu Rodic's avatar Mathieu Rodic

[FEAT] uploading projects is now functional

(documents are parsed from resources, but ngrams are not extracted yet)
parent bd991379
......@@ -10,9 +10,12 @@ NODETYPES = [
]
LANGUAGES = {
# 'fr': {
# 'tagger': FrenchNgramsTagger
# }
'fr': {
# 'tagger': FrenchNgramsTagger
},
'en': {
# 'tagger': EnglishNgramsTagger
},
}
......@@ -66,5 +69,5 @@ QUERY_SIZE_N_DEFAULT = 1000
import os
from .settings import BASE_DIR
UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads')
UPLOAD_LIMIT = 16 * 1024 * 1024
UPLOAD_LIMIT = 1024 * 1024 * 1024
DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
......@@ -33,6 +33,10 @@ class Node(Base):
# metadata
hyperdata = Column(JSONB, default={})
def __init__(self, **kwargs):
Base.__init__(self, **kwargs)
self.hyperdata = {}
def __getitem__(self, key):
return self.hyperdata[key]
......@@ -48,26 +52,37 @@ class Node(Base):
query = query.filter(Node.typename == typename)
return query
def add_child(self, typename, **kwargs):
def add_child(self, **kwargs):
"""Create and return a new direct child of the current node.
"""
return Node(
user_id = self.user_id,
typename = typename,
parent_id = self.id,
**kwargs
)
def add_corpus(self, name, resource_type, resource_upload=None, resource_url=None):
if resource_upload is not None:
resource_path = upload(resource_upload)
def resources(self):
    """Return this node's list of resources, creating it in hyperdata on first access."""
    self.hyperdata.setdefault('resources', [])
    return self['resources']
def add_resource(self, type, path=None, url=None):
    """Record a resource (type id plus a local path and/or a URL) on this node."""
    descriptor = {'type': type, 'path': path, 'url': url}
    self.resources().append(descriptor)
def status(self, action=None, progress=None, autocommit=False):
if 'status' not in self.hyperdata:
self['status'] = {'action': action, 'progress': progress}
else:
resource_path = None
corpus = self.add_child('CORPUS', name=name, hyperdata={
'resource_type': int(resource_type),
'resource_path': resource_path,
'resource_url': resource_url,
})
session.add(corpus)
session.commit()
return corpus
if action is not None:
self['status']['action'] = action
if progress is not None:
self['status']['progress'] = progress
if autocommit:
hyperdata = self.hyperdata.copy()
self.hyperdata = None
session.add(self)
session.commit()
self.hyperdata = hyperdata
session.add(self)
session.commit()
return self['status']
......@@ -18,9 +18,6 @@ class ModelCache(dict):
if preload:
self.preload()
def __del__(self):
session.close()
def __missing__(self, key):
formatted_key = None
conditions = []
......
......@@ -4,7 +4,7 @@ from gargantext.util import http
def save(contents, name='', basedir=''):
digest = str_digest(contents)
digest = str_digest(contents[:4096] + contents[-4096:])
path = basedir
for i in range(2, 8, 2):
path += '/' + digest[:i]
......@@ -17,7 +17,7 @@ def save(contents, name='', basedir=''):
def download(url, name=''):
save(
return save(
contents = http.get(url),
name = name,
basedir = DOWNLOAD_DIRECTORY,
......@@ -30,7 +30,7 @@ def upload(uploaded):
uploaded.size,
UPLOAD_LIMIT,
))
save(
return save(
contents = uploaded.file.read(),
name = uploaded.name,
basedir = UPLOAD_DIRECTORY,
......
from gargantext.constants import *
class Language:
    """A language, identified by its ISO-639 codes and its name.

    ``implemented`` is True when the application declares tooling for the
    language (membership test against the ``LANGUAGES`` constant).
    """

    def __init__(self, iso2=None, iso3=None, name=None):
        self.iso2 = iso2
        self.iso3 = iso3
        self.name = name
        self.implemented = iso2 in LANGUAGES

    def __str__(self):
        attributes = ''.join(
            ' %s="%s"' % (key, value, )
            for key, value in self.__dict__.items()
        )
        return '<Language' + attributes + '>'

    __repr__ = __str__
class Languages(dict):
    """Dictionary of languages with a case-insensitive fallback lookup.

    On a miss, the lookup is retried with the lowercased key, so e.g.
    ``languages['FR']`` resolves to the entry stored under ``'fr'``.
    """

    def __missing__(self, key):
        lowered = key.lower()
        if lowered in self:
            return self[lowered]
        # Include the offending key in the exception: a bare `raise KeyError`
        # made lookup failures impossible to diagnose from the traceback.
        raise KeyError(key)
languages = Languages()

import pycountry

# Mapping from pycountry attribute names to our Language attribute names.
# A None value means "index the language under this code, but do not copy
# it as a Language property".
pycountry_keys = (
    ('iso639_3_code', 'iso3', ),
    ('iso639_1_code', 'iso2', ),
    ('name', 'name', ),
    ('reference_name', None, ),
    ('inverted_name', None, ),
)

for entry in pycountry.languages:
    # collect the properties we keep on the Language object
    properties = {
        ours: getattr(entry, theirs)
        for theirs, ours in pycountry_keys
        if ours is not None and hasattr(entry, theirs)
    }
    language = Language(**properties)
    # index the language under every code/name pycountry knows for it
    for theirs, _ in pycountry_keys:
        if hasattr(entry, theirs):
            languages[getattr(entry, theirs).lower()] = language

# because PubMed has weird language codes:
languages['fre'] = languages['fr']
languages['ger'] = languages['de']
......@@ -30,7 +30,8 @@ from ._Parser import Parser
class EuropressParser(Parser):
def _parse(self, file):
def parse(self, file):
#print("europr_parser file", file)
localeEncoding = "fr_FR"
......@@ -262,6 +263,7 @@ class EuropressParser(Parser):
except:
raise Exception('Something bad happened.')
if __name__ == "__main__":
e = EuropressFileParser()
hyperdata = e.parse(str(sys.argv[1]))
......
......@@ -6,12 +6,31 @@ from io import BytesIO
class PubmedParser(Parser):
def _parse(self, file):
hyperdata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"title2" : 'MedlineCitation/Article/VernacularTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
"realdate_full_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
"realdate_year_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Year',
"realdate_month_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Month',
"realdate_day_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Day',
"publication_year" : 'MedlineCitation/DateCreated/Year',
"publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_day" : 'MedlineCitation/DateCreated/Day',
"authors" : 'MedlineCitation/Article/AuthorList',
}
# xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
def parse(self, file):
# open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
if type(file) == bytes:
if isinstance(file, bytes):
file = BytesIO(file)
xml = etree.parse(file, parser=xml_parser)
xml = etree.parse(file, parser=self.xml_parser)
xml_articles = xml.findall('PubmedArticle')
# initialize the list of hyperdata
hyperdata_list = []
......@@ -19,23 +38,7 @@ class PubmedParser(Parser):
for xml_article in xml_articles:
# extract data from the document
hyperdata = {}
hyperdata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"title2" : 'MedlineCitation/Article/VernacularTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
"realdate_full_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
"realdate_year_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Year',
"realdate_month_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Month',
"realdate_day_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Day',
"publication_year" : 'MedlineCitation/DateCreated/Year',
"publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_day" : 'MedlineCitation/DateCreated/Day',
"authors" : 'MedlineCitation/Article/AuthorList',
}
for key, path in hyperdata_path.items():
for key, path in self.hyperdata_path.items():
try:
xml_node = xml_article.find(path)
# Authors tag
......
import collections
import datetime
import dateutil.parser
import zipfile
import re
from gargantext.util.languages import languages
DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)
......@@ -11,8 +12,15 @@ DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)
class Parser:
"""Base class for performing files parsing depending on their type.
"""
def __init__(self, language_cache=None):
self._languages_cache = LanguagesCache() if language_cache is None else language_cache
def __init__(self, file):
if isinstance(file, str):
self._file = open(file, 'rb')
else:
self._file = file
def __del__(self):
self._file.close()
def detect_encoding(self, string):
"""Useful method to detect the encoding of a document.
......@@ -21,7 +29,6 @@ class Parser:
encoding = chardet.detect(string)
return encoding.get('encoding', 'UTF-8')
def format_hyperdata_dates(self, hyperdata):
"""Format the dates found in the hyperdata.
Examples:
......@@ -37,7 +44,6 @@ class Parser:
date_string = hyperdata.get('publication_date_to_parse', None)
if date_string is not None:
date_string = re.sub(r'\/\/+(\w*|\d*)', '', date_string)
#date_string = re.sub(r'undefined', '', date_string)
try:
hyperdata['publication' + "_date"] = dateutil.parser.parse(
date_string,
......@@ -94,17 +100,25 @@ class Parser:
def format_hyperdata_languages(self, hyperdata):
"""format the languages found in the hyperdata."""
language = None
for key in ["fullname", "iso3", "iso2"]:
language_key = "language_" + key
language_keyerrors = {}
for key in ('name', 'iso3', 'iso2', ):
language_key = 'language_' + key
if language_key in hyperdata:
language_symbol = hyperdata[language_key]
language = self._languages_cache[language_symbol]
if language:
break
if language:
hyperdata["language_iso2"] = language.iso2
hyperdata["language_iso3"] = language.iso3
hyperdata["language_fullname"] = language.fullname
try:
language_symbol = hyperdata[language_key]
language = languages[language_symbol]
if language:
break
except KeyError:
language_keyerrors[key] = language_symbol
if language is not None:
hyperdata['language_iso2'] = language.iso2
hyperdata['language_iso3'] = language.iso3
hyperdata['language_name'] = language.name
elif language_keyerrors:
print('Unrecognized language: %s' % ', '.join(
'%s="%s"' % (key, value) for key, value in language_keyerrors.items()
))
return hyperdata
def format_hyperdata(self, hyperdata):
......@@ -113,34 +127,22 @@ class Parser:
hyperdata = self.format_hyperdata_languages(hyperdata)
return hyperdata
def _parse(self, file):
"""This method shall be overriden by inherited classes."""
return list()
def parse(self, file):
def __iter__(self, file=None):
"""Parse the file, and its children files found in the file.
"""
# initialize the list of hyperdata
hyperdata_list = []
if file is None:
file = self._file
# if the file is a ZIP archive, recurse on each of its files...
if zipfile.is_zipfile(file):
# if the file is a ZIP archive, recurse on each of its files...
zipArchive = zipfile.ZipFile(file)
for filename in zipArchive.namelist():
try:
f = zipArchive.open(filename, 'r')
hyperdata_list += self.parse(f)
f.close()
except Exception as error:
print(error)
f = zipArchive.open(filename, 'r')
yield from self.__iter__(f)
f.close()
# ...otherwise, let's parse it directly!
else:
try:
for hyperdata in self._parse(file):
hyperdata_list.append(self.format_hyperdata(hyperdata))
if hasattr(file, 'close'):
file.close()
except Exception as error:
print(error)
# return the list of formatted hyperdata
return hyperdata_list
file.seek(0)
except:pass
for hyperdata in self.parse(file):
yield self.format_hyperdata(hyperdata)
......@@ -19,7 +19,6 @@ def scheduled_thread(func):
return go
from celery import shared_task
def scheduled_celery(func):
"""Provides a decorator to schedule a task with Celery.
......@@ -32,6 +31,9 @@ def scheduled_celery(func):
return go
# scheduled = scheduled_now
# scheduled = scheduled_thread
scheduled = scheduled_celery
from gargantext.settings import DEBUG
if DEBUG == True:
# scheduled = scheduled_now
scheduled = scheduled_thread
else:
scheduled = scheduled_celery
"""This module defines three distinct decorators for scheduling.
"""
def scheduled_now(func):
    """Run the task synchronously, in the calling thread.

    Identity decorator — mostly useful when debugging scheduled tasks.
    """
    return func
import threading

def scheduled_thread(func):
    """Schedule the task by running it on a new thread.

    Caveat: an interpreter shutdown may lose a still-running task.
    """
    def go(*args, **kwargs):
        threading.Thread(target=func, args=args, kwargs=kwargs).start()
    return go
from celery import shared_task

def scheduled_celery(func):
    """Schedule the task asynchronously through Celery."""
    # register a Celery task wrapping the function...
    @shared_task
    def _task(*args, **kwargs):
        func(*args, **kwargs)
    # ...and return a trigger that enqueues it with the given arguments
    def go(*args, **kwargs):
        _task.apply_async(args=args, kwargs=kwargs)
    return go
# Site-wide scheduling backend: swap the active line to change strategy.
# scheduled = scheduled_now
scheduled = scheduled_thread
# scheduled = scheduled_celery
from gargantext.util.db import *
from gargantext.models import *
from gargantext.util.schedule import scheduled
from gargantext.util.scheduling import scheduled
from time import sleep
from gargantext.constants import *
@scheduled
def parse(corpus_id):
    """Background task: parse all resources of the given corpus.

    Looks up the corpus node by id, runs the parser registered for each
    resource type over the resource's file, and inserts one DOCUMENT child
    node per parsed record, persisting progress periodically via status().
    """
    print('CORPUS #%d...' % (corpus_id, ))
    # retrieve corpus from database
    corpus = session.query(Node).filter(Node.id == corpus_id).first()
    # NOTE(review): looks like a leftover debugging delay — confirm whether
    # this sleep is still needed before removing it
    sleep(2)
    if corpus is None:
        print('NO SUCH CORPUS: #%d' % corpus_id)
        return
    print('CORPUS #%d: %s' % (corpus_id, corpus, ))
    # retrieve resource information
    documents_count = 0
    for resource in corpus['resources']:
        # information about the resource
        resource_parser = RESOURCETYPES[resource['type']]['parser']
        resource_path = resource['path']
        # extract and insert documents from corpus resource into database
        for hyperdata in resource_parser(resource_path):
            document = corpus.add_child(
                typename = 'DOCUMENT',
                name = hyperdata.get('title', '')[:255],
                hyperdata = hyperdata,
            )
            session.add(document)
            # persist status every 64 documents so progress is observable
            if documents_count % 64 == 0:
                corpus.status(action='parsing', progress=documents_count, autocommit=True)
            documents_count += 1
    # commit all changes
    corpus.status(action='parsing', progress=documents_count)
    session.commit()
......@@ -2,6 +2,7 @@ from gargantext.util import workflow
from gargantext.util.http import *
from gargantext.util.db import *
from gargantext.util.db_cache import cache
from gargantext.util.files import upload
from gargantext.models import *
from gargantext.constants import *
......@@ -81,34 +82,40 @@ def project(request, project_id):
# new corpus
if request.method == 'POST':
corpus = project.add_corpus(
corpus = project.add_child(
name = request.POST['name'],
resource_type = request.POST['type'],
resource_upload = request.FILES['file'],
typename = 'CORPUS',
)
corpus.add_resource(
type = int(request.POST['type']),
path = upload(request.FILES['file']),
)
session.add(corpus)
session.commit()
workflow.parse(corpus.id)
# corpora within this project
corpora = project.children('CORPUS').all()
corpora_by_source = defaultdict(list)
sourcename2corpora = defaultdict(list)
for corpus in corpora:
resource_type = RESOURCETYPES[corpus['resource_type']]
corpora_by_source[resource_type['name']].append(corpus)
# we only consider the first resource of the corpus to determine its type
resource = corpus.resources()[0]
resource_type = RESOURCETYPES[resource['type']]
sourcename2corpora[resource_type['name']].append(corpus)
# source & their respective counts
total_count = 0
sources_counts = defaultdict(int)
for document in corpora:
source = RESOURCETYPES[document['resource_type']]
sourcename = re.sub(' \(.*$', '', source['name'])
count = document.children('DOCUMENT').count()
sources_counts[sourcename] += count
count += total_count
total_documentscount = 0
sourcename2documentscount = defaultdict(int)
for sourcename, corpora in sourcename2corpora.items():
sourcename = re.sub(' \(.*$', '', sourcename)
count = corpus.children('DOCUMENT').count()
sourcename2documentscount[sourcename] += count
total_documentscount += count
donut = [
{ 'source': sourcename,
'count': count,
'part' : round(count * 100.0 / total_count) if total_count else 0,
'part' : round(count * 100.0 / total_documentscount, 1) if total_documentscount else 0,
}
for sourcename, count in sources_counts.items()
for sourcename, count in sourcename2documentscount.items()
]
# response!
return render(
......@@ -120,7 +127,7 @@ def project(request, project_id):
'date': datetime.now(),
'project': project,
'donut': donut,
'list_corpora': dict(corpora_by_source),
'list_corpora': dict(sourcename2corpora),
'whitelists': [],
'blacklists': [],
'cooclists': [],
......
......@@ -15,6 +15,7 @@ jdatetime==1.7.2
kombu==3.0.33
lxml==3.5.0
psycopg2==2.6.1
pycountry==1.20
python-dateutil==2.4.2
pytz==2015.7
six==1.10.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment