Commit 1f000317 authored by Mathieu Rodic's avatar Mathieu Rodic

[FEAT] uploading projects is now functional

(documents are parsed from resources, but ngrams are not extracted yet)
parent bd991379
......@@ -10,9 +10,12 @@ NODETYPES = [
]
LANGUAGES = {
# 'fr': {
# 'tagger': FrenchNgramsTagger
# }
'fr': {
# 'tagger': FrenchNgramsTagger
},
'en': {
# 'tagger': EnglishNgramsTagger
},
}
......@@ -66,5 +69,5 @@ QUERY_SIZE_N_DEFAULT = 1000
import os
from .settings import BASE_DIR
UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads')
UPLOAD_LIMIT = 16 * 1024 * 1024
UPLOAD_LIMIT = 1024 * 1024 * 1024
DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
......@@ -33,6 +33,10 @@ class Node(Base):
# metadata
hyperdata = Column(JSONB, default={})
def __init__(self, **kwargs):
Base.__init__(self, **kwargs)
self.hyperdata = {}
def __getitem__(self, key):
return self.hyperdata[key]
......@@ -48,26 +52,37 @@ class Node(Base):
query = query.filter(Node.typename == typename)
return query
def add_child(self, typename, **kwargs):
def add_child(self, **kwargs):
"""Create and return a new direct child of the current node.
"""
return Node(
user_id = self.user_id,
typename = typename,
parent_id = self.id,
**kwargs
)
def add_corpus(self, name, resource_type, resource_upload=None, resource_url=None):
if resource_upload is not None:
resource_path = upload(resource_upload)
def resources(self):
    """Return this node's list of resources, creating it in hyperdata on first access."""
    self.hyperdata.setdefault('resources', [])
    return self['resources']
def add_resource(self, type, path=None, url=None):
    """Record a resource (type id plus a local path and/or a URL) on this node."""
    descriptor = {'type': type, 'path': path, 'url': url}
    self.resources().append(descriptor)
def status(self, action=None, progress=None, autocommit=False):
if 'status' not in self.hyperdata:
self['status'] = {'action': action, 'progress': progress}
else:
resource_path = None
corpus = self.add_child('CORPUS', name=name, hyperdata={
'resource_type': int(resource_type),
'resource_path': resource_path,
'resource_url': resource_url,
})
session.add(corpus)
session.commit()
return corpus
if action is not None:
self['status']['action'] = action
if progress is not None:
self['status']['progress'] = progress
if autocommit:
hyperdata = self.hyperdata.copy()
self.hyperdata = None
session.add(self)
session.commit()
self.hyperdata = hyperdata
session.add(self)
session.commit()
return self['status']
......@@ -18,9 +18,6 @@ class ModelCache(dict):
if preload:
self.preload()
def __del__(self):
session.close()
def __missing__(self, key):
formatted_key = None
conditions = []
......
......@@ -4,7 +4,7 @@ from gargantext.util import http
def save(contents, name='', basedir=''):
digest = str_digest(contents)
digest = str_digest(contents[:4096] + contents[-4096:])
path = basedir
for i in range(2, 8, 2):
path += '/' + digest[:i]
......@@ -17,7 +17,7 @@ def save(contents, name='', basedir=''):
def download(url, name=''):
save(
return save(
contents = http.get(url),
name = name,
basedir = DOWNLOAD_DIRECTORY,
......@@ -30,7 +30,7 @@ def upload(uploaded):
uploaded.size,
UPLOAD_LIMIT,
))
save(
return save(
contents = uploaded.file.read(),
name = uploaded.name,
basedir = UPLOAD_DIRECTORY,
......
from gargantext.constants import *
class Language:
    """A language, identified by its ISO-639 codes and its name.

    ``implemented`` is True when the application declares tooling for the
    language (membership test against the ``LANGUAGES`` constant).
    """

    def __init__(self, iso2=None, iso3=None, name=None):
        self.iso2 = iso2
        self.iso3 = iso3
        self.name = name
        self.implemented = iso2 in LANGUAGES

    def __str__(self):
        attributes = ''.join(
            ' %s="%s"' % (key, value, )
            for key, value in self.__dict__.items()
        )
        return '<Language' + attributes + '>'

    __repr__ = __str__
class Languages(dict):
    """Dictionary of languages with a case-insensitive fallback lookup.

    On a miss, the lookup is retried with the lowercased key, so e.g.
    ``languages['FR']`` resolves to the entry stored under ``'fr'``.
    """

    def __missing__(self, key):
        lowered = key.lower()
        if lowered in self:
            return self[lowered]
        # Include the offending key in the exception: a bare `raise KeyError`
        # made lookup failures impossible to diagnose from the traceback.
        raise KeyError(key)
languages = Languages()

import pycountry

# Mapping from pycountry attribute names to our Language attribute names.
# A None value means "index the language under this code, but do not copy
# it as a Language property".
pycountry_keys = (
    ('iso639_3_code', 'iso3', ),
    ('iso639_1_code', 'iso2', ),
    ('name', 'name', ),
    ('reference_name', None, ),
    ('inverted_name', None, ),
)

for entry in pycountry.languages:
    # collect the properties we keep on the Language object
    properties = {
        ours: getattr(entry, theirs)
        for theirs, ours in pycountry_keys
        if ours is not None and hasattr(entry, theirs)
    }
    language = Language(**properties)
    # index the language under every code/name pycountry knows for it
    for theirs, _ in pycountry_keys:
        if hasattr(entry, theirs):
            languages[getattr(entry, theirs).lower()] = language

# because PubMed has weird language codes:
languages['fre'] = languages['fr']
languages['ger'] = languages['de']
......@@ -30,7 +30,8 @@ from ._Parser import Parser
class EuropressParser(Parser):
def _parse(self, file):
def parse(self, file):
#print("europr_parser file", file)
localeEncoding = "fr_FR"
......@@ -262,6 +263,7 @@ class EuropressParser(Parser):
except:
raise Exception('Something bad happened.')
if __name__ == "__main__":
e = EuropressFileParser()
hyperdata = e.parse(str(sys.argv[1]))
......
......@@ -6,12 +6,31 @@ from io import BytesIO
class PubmedParser(Parser):
def _parse(self, file):
hyperdata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"title2" : 'MedlineCitation/Article/VernacularTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
"realdate_full_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
"realdate_year_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Year',
"realdate_month_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Month',
"realdate_day_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Day',
"publication_year" : 'MedlineCitation/DateCreated/Year',
"publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_day" : 'MedlineCitation/DateCreated/Day',
"authors" : 'MedlineCitation/Article/AuthorList',
}
# xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
def parse(self, file):
# open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
if type(file) == bytes:
if isinstance(file, bytes):
file = BytesIO(file)
xml = etree.parse(file, parser=xml_parser)
xml = etree.parse(file, parser=self.xml_parser)
xml_articles = xml.findall('PubmedArticle')
# initialize the list of hyperdata
hyperdata_list = []
......@@ -19,23 +38,7 @@ class PubmedParser(Parser):
for xml_article in xml_articles:
# extract data from the document
hyperdata = {}
hyperdata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"title2" : 'MedlineCitation/Article/VernacularTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
"realdate_full_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
"realdate_year_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Year',
"realdate_month_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Month',
"realdate_day_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Day',
"publication_year" : 'MedlineCitation/DateCreated/Year',
"publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_day" : 'MedlineCitation/DateCreated/Day',
"authors" : 'MedlineCitation/Article/AuthorList',
}
for key, path in hyperdata_path.items():
for key, path in self.hyperdata_path.items():
try:
xml_node = xml_article.find(path)
# Authors tag
......
import collections
import datetime
import dateutil.parser
import zipfile
import re
from gargantext.util.languages import languages
DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)
......@@ -11,8 +12,15 @@ DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)
class Parser:
"""Base class for performing files parsing depending on their type.
"""
def __init__(self, language_cache=None):
self._languages_cache = LanguagesCache() if language_cache is None else language_cache
def __init__(self, file):
if isinstance(file, str):
self._file = open(file, 'rb')
else:
self._file = file
def __del__(self):
self._file.close()
def detect_encoding(self, string):
"""Useful method to detect the encoding of a document.
......@@ -21,7 +29,6 @@ class Parser:
encoding = chardet.detect(string)
return encoding.get('encoding', 'UTF-8')
def format_hyperdata_dates(self, hyperdata):
"""Format the dates found in the hyperdata.
Examples:
......@@ -37,7 +44,6 @@ class Parser:
date_string = hyperdata.get('publication_date_to_parse', None)
if date_string is not None:
date_string = re.sub(r'\/\/+(\w*|\d*)', '', date_string)
#date_string = re.sub(r'undefined', '', date_string)
try:
hyperdata['publication' + "_date"] = dateutil.parser.parse(
date_string,
......@@ -94,17 +100,25 @@ class Parser:
def format_hyperdata_languages(self, hyperdata):
"""format the languages found in the hyperdata."""
language = None
for key in ["fullname", "iso3", "iso2"]:
language_key = "language_" + key
language_keyerrors = {}
for key in ('name', 'iso3', 'iso2', ):
language_key = 'language_' + key
if language_key in hyperdata:
language_symbol = hyperdata[language_key]
language = self._languages_cache[language_symbol]
if language:
break
if language:
hyperdata["language_iso2"] = language.iso2
hyperdata["language_iso3"] = language.iso3
hyperdata["language_fullname"] = language.fullname
try:
language_symbol = hyperdata[language_key]
language = languages[language_symbol]
if language:
break
except KeyError:
language_keyerrors[key] = language_symbol
if language is not None:
hyperdata['language_iso2'] = language.iso2
hyperdata['language_iso3'] = language.iso3
hyperdata['language_name'] = language.name
elif language_keyerrors:
print('Unrecognized language: %s' % ', '.join(
'%s="%s"' % (key, value) for key, value in language_keyerrors.items()
))
return hyperdata
def format_hyperdata(self, hyperdata):
......@@ -113,34 +127,22 @@ class Parser:
hyperdata = self.format_hyperdata_languages(hyperdata)
return hyperdata
def _parse(self, file):
"""This method shall be overriden by inherited classes."""
return list()
def parse(self, file):
def __iter__(self, file=None):
"""Parse the file, and its children files found in the file.
"""
# initialize the list of hyperdata
hyperdata_list = []
if file is None:
file = self._file
# if the file is a ZIP archive, recurse on each of its files...
if zipfile.is_zipfile(file):
# if the file is a ZIP archive, recurse on each of its files...
zipArchive = zipfile.ZipFile(file)
for filename in zipArchive.namelist():
try:
f = zipArchive.open(filename, 'r')
hyperdata_list += self.parse(f)
f.close()
except Exception as error:
print(error)
f = zipArchive.open(filename, 'r')
yield from self.__iter__(f)
f.close()
# ...otherwise, let's parse it directly!
else:
try:
for hyperdata in self._parse(file):
hyperdata_list.append(self.format_hyperdata(hyperdata))
if hasattr(file, 'close'):
file.close()
except Exception as error:
print(error)
# return the list of formatted hyperdata
return hyperdata_list
file.seek(0)
except:pass
for hyperdata in self.parse(file):
yield self.format_hyperdata(hyperdata)
......@@ -19,7 +19,6 @@ def scheduled_thread(func):
return go
from celery import shared_task
def scheduled_celery(func):
"""Provides a decorator to schedule a task with Celery.
......@@ -32,6 +31,9 @@ def scheduled_celery(func):
return go
# scheduled = scheduled_now
# scheduled = scheduled_thread
scheduled = scheduled_celery
from gargantext.settings import DEBUG
if DEBUG == True:
# scheduled = scheduled_now
scheduled = scheduled_thread
else:
scheduled = scheduled_celery
"""This module defines three distinct decorators for scheduling.
"""
def scheduled_now(func):
    """Run the task synchronously, in the calling thread.

    Identity decorator — mostly useful when debugging scheduled tasks.
    """
    return func
import threading

def scheduled_thread(func):
    """Schedule the task by running it on a new thread.

    Caveat: an interpreter shutdown may lose a still-running task.
    """
    def go(*args, **kwargs):
        threading.Thread(target=func, args=args, kwargs=kwargs).start()
    return go
from celery import shared_task

def scheduled_celery(func):
    """Schedule the task asynchronously through Celery."""
    # register a Celery task wrapping the function...
    @shared_task
    def _task(*args, **kwargs):
        func(*args, **kwargs)
    # ...and return a trigger that enqueues it with the given arguments
    def go(*args, **kwargs):
        _task.apply_async(args=args, kwargs=kwargs)
    return go
# Site-wide scheduling backend: swap the active line to change strategy.
# scheduled = scheduled_now
scheduled = scheduled_thread
# scheduled = scheduled_celery
from gargantext.util.db import *
from gargantext.models import *
from gargantext.util.schedule import scheduled
from gargantext.util.scheduling import scheduled
from time import sleep
from gargantext.constants import *
@scheduled
def parse(corpus_id):
    """Background task: parse all resources of the given corpus.

    Looks up the corpus node by id, runs the parser registered for each
    resource type over the resource's file, and inserts one DOCUMENT child
    node per parsed record, persisting progress periodically via status().
    """
    print('CORPUS #%d...' % (corpus_id, ))
    # retrieve corpus from database
    corpus = session.query(Node).filter(Node.id == corpus_id).first()
    # NOTE(review): looks like a leftover debugging delay — confirm whether
    # this sleep is still needed before removing it
    sleep(2)
    if corpus is None:
        print('NO SUCH CORPUS: #%d' % corpus_id)
        return
    print('CORPUS #%d: %s' % (corpus_id, corpus, ))
    # retrieve resource information
    documents_count = 0
    for resource in corpus['resources']:
        # information about the resource
        resource_parser = RESOURCETYPES[resource['type']]['parser']
        resource_path = resource['path']
        # extract and insert documents from corpus resource into database
        for hyperdata in resource_parser(resource_path):
            document = corpus.add_child(
                typename = 'DOCUMENT',
                name = hyperdata.get('title', '')[:255],
                hyperdata = hyperdata,
            )
            session.add(document)
            # persist status every 64 documents so progress is observable
            if documents_count % 64 == 0:
                corpus.status(action='parsing', progress=documents_count, autocommit=True)
            documents_count += 1
    # commit all changes
    corpus.status(action='parsing', progress=documents_count)
    session.commit()
......@@ -2,6 +2,7 @@ from gargantext.util import workflow
from gargantext.util.http import *
from gargantext.util.db import *
from gargantext.util.db_cache import cache
from gargantext.util.files import upload
from gargantext.models import *
from gargantext.constants import *
......@@ -81,34 +82,40 @@ def project(request, project_id):
# new corpus
if request.method == 'POST':
corpus = project.add_corpus(
corpus = project.add_child(
name = request.POST['name'],
resource_type = request.POST['type'],
resource_upload = request.FILES['file'],
typename = 'CORPUS',
)
corpus.add_resource(
type = int(request.POST['type']),
path = upload(request.FILES['file']),
)
session.add(corpus)
session.commit()
workflow.parse(corpus.id)
# corpora within this project
corpora = project.children('CORPUS').all()
corpora_by_source = defaultdict(list)
sourcename2corpora = defaultdict(list)
for corpus in corpora:
resource_type = RESOURCETYPES[corpus['resource_type']]
corpora_by_source[resource_type['name']].append(corpus)
# we only consider the first resource of the corpus to determine its type
resource = corpus.resources()[0]
resource_type = RESOURCETYPES[resource['type']]
sourcename2corpora[resource_type['name']].append(corpus)
# source & their respective counts
total_count = 0
sources_counts = defaultdict(int)
for document in corpora:
source = RESOURCETYPES[document['resource_type']]
sourcename = re.sub(' \(.*$', '', source['name'])
count = document.children('DOCUMENT').count()
sources_counts[sourcename] += count
count += total_count
total_documentscount = 0
sourcename2documentscount = defaultdict(int)
for sourcename, corpora in sourcename2corpora.items():
sourcename = re.sub(' \(.*$', '', sourcename)
count = corpus.children('DOCUMENT').count()
sourcename2documentscount[sourcename] += count
total_documentscount += count
donut = [
{ 'source': sourcename,
'count': count,
'part' : round(count * 100.0 / total_count) if total_count else 0,
'part' : round(count * 100.0 / total_documentscount, 1) if total_documentscount else 0,
}
for sourcename, count in sources_counts.items()
for sourcename, count in sourcename2documentscount.items()
]
# response!
return render(
......@@ -120,7 +127,7 @@ def project(request, project_id):
'date': datetime.now(),
'project': project,
'donut': donut,
'list_corpora': dict(corpora_by_source),
'list_corpora': dict(sourcename2corpora),
'whitelists': [],
'blacklists': [],
'cooclists': [],
......
......@@ -15,6 +15,7 @@ jdatetime==1.7.2
kombu==3.0.33
lxml==3.5.0
psycopg2==2.6.1
pycountry==1.20
python-dateutil==2.4.2
pytz==2015.7
six==1.10.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment