[FIX] Merge ready for unstable.

aa4e82ee · delanoe · d5d87ef8 · aa4e82ee · aa4e82ee · aa4e82ee
Commit aa4e82ee authored May 27, 2016 by delanoe
8 changed files
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -112,18 +112,17 @@ INDEXED_HYPERDATA = {
 }
-#from gargantext.util.taggers import FrenchMeltTagger, TurboTagger
+from gargantext.util.taggers import FrenchMeltTagger, TurboTagger
-from gargantext.util.taggers import NltkTagger
 LANGUAGES = {
    'en': {
        #'tagger': EnglishMeltTagger,
-        #'tagger': TurboTagger,
+        'tagger': TurboTagger,
-        'tagger': NltkTagger,
+        #'tagger': NltkTagger,
    },
    'fr': {
-        #'tagger': FrenchMeltTagger,
+        'tagger': FrenchMeltTagger,
        # 'tagger': TreeTagger,
-        'tagger': NltkTagger,
    },
 }
@@ -131,96 +130,85 @@ LANGUAGES = {
 from gargantext.util.parsers import \
    EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser
-#from gargantext.util.scrappers import \
+def resourcetype(name):
-#    CernScraper
+    '''
+    resourcetype :: String -> Int
+    Usage : resourcetype("Europress (English)") == 1
-def get_resource(corpus_type):
+    Examples in scrapers scripts (Pubmed or ISTex for instance).
-    '''get ressources values for a given ressource_type id'''
+    '''
-    for n in RESOURCETYPES:
+    return [n[0]  for n in enumerate(r['name'] for r in RESOURCETYPES) if n[1] == name][0]
-        if n["type"] == corpus_type:
-            return n
+def resourcename(corpus):
+    '''
+    resourcename :: Corpus -> String
+    Usage : resourcename(corpus) == "ISTex"
+    '''
+    resource = corpus.resources()[0]
+    resourcename = RESOURCETYPES[resource['type']]['name']
+    return re.sub(r'\(.*', '', resourcename)
 RESOURCETYPES = [
    # type 0
-    {   'type':0,
+    {   'name': 'Select database below',
-        'name': 'Select database below',
        'parser': None,
        'default_language': None,
    },
    # type 1
-    {   'type':1,
+    {   'name': 'Europress (English)',
-        'name': 'Europress (English)',
        'parser': EuropressParser,
        'default_language': 'en',
-        'accepted_formats':["zip",],
    },
    # type 2
-    {   'type':2,
+    {   'name': 'Europress (French)',
-        'name': 'Europress (French)',
        'parser': EuropressParser,
        'default_language': 'fr',
-        'accepted_formats':["zip",],
    },
    # type 3
-    {   'type':3,
+    {   'name': 'Jstor (RIS format)',
-        'name': 'Jstor (RIS format)',
        'parser': RISParser,
        'default_language': 'en',
-        'accepted_formats':["zip",],
    },
    # type 4
-    {   'type':4,
+    {   'name': 'Pubmed (XML format)',
-        'name': 'Pubmed (XML format)',
        'parser': PubmedParser,
        'default_language': 'en',
-        'accepted_formats':["zip",],
    },
    # type 5
-    {   'type':5,
+    {   'name': 'Scopus (RIS format)',
-        'name': 'Scopus (RIS format)',
        'parser': RISParser,
        'default_language': 'en',
-        'accepted_formats':["zip",],
    },
    # type 6
-    {   'type': 6,
+    {   'name': 'Web of Science (ISI format)',
-        'name': 'Web of Science (ISI format)',
        'parser': ISIParser,
        'default_language': 'en',
-        'accepted_formats':["zip",],
    },
    # type 7
-    {   'type':7,
+    {   'name': 'Zotero (RIS format)',
-        'name': 'Zotero (RIS format)',
        'parser': RISParser,
        'default_language': 'en',
-        'accepted_formats':["zip",],
    },
    # type 8
-    {   'type':8,
+    {   'name': 'CSV',
-        'name': 'CSV',
        'parser': CSVParser,
        'default_language': 'en',
-        'accepted_formats':["csv"],
    },
    # type 9
-    {   "type":9,
+    {   'name': 'ISTex',
-        'name': 'ISTex',
        'parser': ISTexParser,
        'default_language': 'en',
-        'accepted_formats':["zip",],
    },
-   {    "type":10,
+    # type 10
-        "name": 'SCOAP (XML MARC21 Format)',
+    {    "type":10,
-        "parser": CernParser,
+         "name": 'SCOAP (XML MARC21 Format)',
-        "default_language": "en",
+         "parser": CernParser,
-        'accepted_formats':["zip","xml"],
+         "default_language": "en",
-        #~ "scrapper": CernScrapper,
+         'accepted_formats':["zip","xml"],
-        #~ "base_url": "http://api.scoap3.org/search?",
+         #~ "scrapper": CernScrapper,
-   },
+         #~ "base_url": "http://api.scoap3.org/search?",
-]
+    },
+]
 # linguistic extraction parameters ---------------------------------------------
 DEFAULT_RANK_CUTOFF_RATIO      = .75         # MAINLIST maximum terms in %
@@ -246,8 +234,8 @@ DEFAULT_ALL_LOWERCASE_FLAG      = True       # lowercase ngrams before recording
                                             #  occurring at sentence beginning)
 # ------------------------------------------------------------------------------
-# other parameters
+# other parameters
 # default number of docs POSTed to scrappers.views.py
 #  (at page  project > add a corpus > scan/process sample)
 QUERY_SIZE_N_DEFAULT = 1000
@@ -257,7 +245,7 @@ from .settings import BASE_DIR
 # uploads/.gitignore prevents corpora indexing
 # corpora can be either a folder or symlink towards specific partition
 UPLOAD_DIRECTORY   = os.path.join(BASE_DIR, 'uploads/corpora')
-UPLOAD_LIMIT       = 1024* 1024 * 1024
+UPLOAD_LIMIT       = 1024 * 1024 * 1024
 DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY

--- a/gargantext/models/nodes.py
+++ b/gargantext/models/nodes.py
@@ -110,7 +110,6 @@ class Node(Base):
        if order is not None:
            query = query.order_by(Node.name)
        return query
    def add_child(self, **kwargs):
@@ -136,7 +135,7 @@ class Node(Base):
            self['resources'] = MutableList()
        return self['resources']
-    def add_resource(self, type, path=None, url=None, **kwargs):
+    def add_resource(self, type, path=None, url=None):
        """Attach a resource to a given node.
        Mainly used for corpora.
@@ -146,13 +145,10 @@ class Node(Base):
        {'extracted': True,
          'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Français-2015-12-11.zip',
          'type': 1,
-          'url': None,
+          'url': None}
-          'status':
-          'status_message':
-          }
        """
        self.resources().append(MutableDict(
-            {'type': type, 'path':path, 'url':url, 'extracted': False, **kwargs}
+            {'type': type, 'path':path, 'url':url, 'extracted': False}
        ))
    def status(self, action=None, progress=0, complete=False, error=None):

--- a/gargantext/util/http.py
+++ b/gargantext/util/http.py
@@ -16,7 +16,6 @@ def requires_auth(func):
    Also passes the URL to redirect towards as a GET parameter.
    """
    def _requires_auth(request, *args, **kwargs):
-        #print(request.user.is_authenticated())
        if not request.user.is_authenticated():
            url = '/auth/login/?next=%s' % urlencode(request.path)
            return redirect(url)

--- a/gargantext/util/taggers/__init__.py
+++ b/gargantext/util/taggers/__init__.py
-#from .TurboTagger import TurboTagger
+from .TurboTagger import TurboTagger
 from .NltkTagger import NltkTagger
-#from .TreeTagger import TreeTagger
+from .TreeTagger import TreeTagger
-#from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
+from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
--- a/gargantext/views/pages/projects.py
+++ b/gargantext/views/pages/projects.py
@@ -2,12 +2,11 @@ from gargantext.util.http import *
 from gargantext.util.db import *
 from gargantext.util.db_cache import cache
 from gargantext.util.files import upload
-from gargantext.util.files import check_format
 from gargantext.models import *
 from gargantext.constants import *
 from gargantext.util.scheduling import scheduled
 from gargantext.util.toolchain import parse_extract_indexhyperdata
-from gargantext.util.toolchain import add_corpus
 from datetime import datetime
 from collections import defaultdict
@@ -18,7 +17,7 @@ import re
 @requires_auth
 def overview(request):
    '''This view show all projects for a given user.
-    Each project is described with hyperdata that are updated on each following view.
+    Each project is described with hyperdata that are updated on each following view.
    To each project, we can link a resource that can be an image.
    '''
@@ -60,25 +59,17 @@ def overview(request):
 class NewCorpusForm(forms.Form):
-    '''OK: add corpus Form (NIY)'''
    type = forms.ChoiceField(
        choices = enumerate(resource_type['name'] for resource_type in RESOURCETYPES),
        widget = forms.Select(attrs={ 'onchange' :'CustomForSelect( $("option:selected", this).text() );'})
    )
    name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
    file = forms.FileField()
    def clean_file(self):
        file_ = self.cleaned_data.get('file')
-        if len(file_) > UPLOAD_LIMIT : # we don't accept more than 1GB
+        if len(file_) > 1024 ** 3 : # we don't accept more than 1GB
            raise forms.ValidationError(ugettext_lazy('File too heavy! (>1GB).'))
        return file_
-    def check_filename(self):
-        print(self.cleaned_data)
-        print (self.cleaned_data.get("file").split(".")[-1])
-        #if self.cleaned_data.get("file").split(".")[-1] not in RESSOURCETYPES[choices]
-        #print RESOURCETYPES[self.cleaned_data.get("
-        pass
 @requires_auth
@@ -92,55 +83,61 @@ def project(request, project_id):
    if not user.owns(project):
        raise HttpResponseForbidden()
-    # add a new corpus into Node Project > Node Corpus > Ressource
+    # new corpus
    if request.method == 'POST':
+        corpus = project.add_child(
-        corpus = add_corpus(request, project)
+            name = request.POST['name'],
+            typename = 'CORPUS',
-        if corpus.status:
+        )
-            # parse_extract: fileparsing -> ngram extraction -> lists
+        corpus.add_resource(
-            scheduled(parse_extract_indexhyperdata)(corpus.id)
+            type = int(request.POST['type']),
-            return render(
+            path = upload(request.FILES['file']),
-                template_name = 'pages/projects/wait.html',
+        )
-                request = request,
+        session.add(corpus)
-                context = {
+        session.commit()
+        # parse_extract: fileparsing -> ngram extraction -> lists
+        scheduled(parse_extract_indexhyperdata)(corpus.id)
+        return render(
+            template_name = 'pages/projects/wait.html',
+            request = request,
+            context = {
                'user'   : request.user,
                'project': project,
-                },
+            },
-            )
+        )
-    # list all the corpora within this project
+    # corpora within this project
    corpora = project.children('CORPUS', order=True).all()
-    #print(corpora)
    sourcename2corpora = defaultdict(list)
    for corpus in corpora:
        # we only consider the first resource of the corpus to determine its type
        resources = corpus.resources()
-        if len(resources) > 0:
+        if len(resources):
            resource = resources[0]
-            resource= get_resource(resource["type"])
+            resource_type_name = RESOURCETYPES[resource['type']]['name']
-            ##here map from RESSOURCES_TYPES_ID and NOT NAME
+        else:
-            resource_type_name = resource['name']
+            print("(WARNING) PROJECT view: no listed resource")
-            resource_type_accepted_formats = resource['accepted_formats']
+        # add some data for the viewer
+        corpus.count = corpus.children('DOCUMENT').count()
-            # add some data for the viewer
+        status = corpus.status()
-            corpus.count = corpus.children('DOCUMENT').count()
+        if status is not None and not status['complete']:
-            status = corpus.status()
+            if not status['error']:
-            if status is not None and not status['complete']:
+                corpus.status_message = '(in progress: %s, %d complete)' % (
-                if not status['error']:
+                    status['action'].replace('_', ' '),
-                    corpus.status_message = '(in progress: %s, %d complete)' % (
+                    status['progress'],
-                        status['action'].replace('_', ' '),
+                )
-                        status['progress'],
-                    )
-                else:
-                    corpus.status_message = '(aborted: "%s" after %i docs)' % (
-                        status['error'][-1],
-                        status['progress']
-                    )
            else:
-                corpus.status_message = ''
+                corpus.status_message = '(aborted: "%s" after %i docs)' % (
-            # add
+                    status['error'][-1],
-            sourcename2corpora[resource_type_name].append(corpus)
+                    status['progress']
+                )
+        else:
+            corpus.status_message = ''
+        # add
+        sourcename2corpora[resource_type_name].append(corpus)
    # source & their respective counts
    total_documentscount = 0
    sourcename2documentscount = defaultdict(int)

--- a/gargantext/views/pages/terms.py
+++ b/gargantext/views/pages/terms.py
@@ -2,7 +2,7 @@ from gargantext.util.http     import requires_auth, render, settings
 from gargantext.util.db       import session
 from gargantext.util.db_cache import cache
 from gargantext.models        import Node
-from gargantext.constants     import get_resource
+from gargantext.constants     import resourcename
 from datetime                 import datetime
 @requires_auth
@@ -32,7 +32,7 @@ def ngramtable(request, project_id, corpus_id):
            'date': datetime.now(),
            'project': project,
            'corpus' : corpus,
-            'resourcename' : get_ressource(corpus)["name"],
+            'resourcename' : resourcename(corpus),
            'view': 'terms'
        },
    )
--- a/moissonneurs/istex.py
+++ b/moissonneurs/istex.py
@@ -8,7 +8,7 @@ from traceback                  import print_tb
 from django.shortcuts import redirect, render
 from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
-from gargantext.constants       import QUERY_SIZE_N_MAX
+from gargantext.constants       import resourcetype, QUERY_SIZE_N_MAX
 from gargantext.models.nodes    import Node
 from gargantext.util.db         import session
 from gargantext.util.http       import JsonHttpResponse
@@ -133,7 +133,7 @@ def save(request , project_id):
            if filename!=False:
                # add the uploaded resource to the corpus
                corpus.add_resource(
-                  type = 9
+                  type = resourcetype('ISTex')
                , path = filename
                                   )
                dwnldsOK+=1

--- a/moissonneurs/pubmed.py
+++ b/moissonneurs/pubmed.py
@@ -18,7 +18,7 @@ from traceback                  import print_tb
 from django.shortcuts import redirect
 from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
-from gargantext.constants       import get_resource, QUERY_SIZE_N_MAX
+from gargantext.constants       import resourcetype, QUERY_SIZE_N_MAX
 from gargantext.models.nodes    import Node
 from gargantext.util.db         import session
 from gargantext.util.db_cache   import cache
@@ -134,7 +134,7 @@ def save( request , project_id ) :
            print(filename)
            if filename != False:
                # add the uploaded resource to the corpus
-                corpus.add_resource( type = 4
+                corpus.add_resource( type = resourcetype('Pubmed (XML format)')
                                   , path = filename
                                   , url  = None
                                   )