Commit b82a67ad authored by PkSM3's avatar PkSM3

[UPDATE] pull unstable and explorer view OK

parents 6b2cbdd1 7af355a0
......@@ -25,7 +25,7 @@ def get_session():
from aldjemy.core import get_engine
alias = 'default'
connection = connections[alias]
engine = create_engine("postgresql+psycopg2://alexandre:C8kdcUrAQy66U@localhost/gargandb",
engine = create_engine("postgresql+psycopg2://gargantua:C8kdcUrAQy66U@localhost/gargandb",
use_native_hstore=True)
Session = sessionmaker(bind=engine)
return Session()
......
......@@ -249,35 +249,3 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
#print(data)
return data
from analysis.tfidf import tfidf
def do_tfidf(corpus, reset=True):
    """Compute and persist tf-idf scores for every ngram of every document in *corpus*.

    Scores are stored as NodeNodeNgram rows (nodex=corpus, nodey=document).
    When *reset* is True, previously stored scores for this corpus are wiped first.
    Only nodes whose type is "Corpus" are handled; anything else is reported and skipped.
    """
    with transaction.atomic():
        if reset:
            NodeNodeNgram.objects.filter(nodex=corpus).delete()

        if isinstance(corpus, Node) and corpus.type.name == "Corpus":
            for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
                for node_ngram in Node_Ngram.objects.filter(node=document):
                    try:
                        # get-or-create: skip pairs that already have a score
                        nnn = NodeNodeNgram.objects.get(nodex=corpus, nodey=document, ngram=node_ngram.ngram)
                    except NodeNodeNgram.DoesNotExist:
                        # fixed: was a bare `except:` that swallowed every error
                        # (DB failures, typos), not just the missing-row case.
                        score = tfidf(corpus, document, node_ngram.ngram)
                        # node_ngram.node is the same document we filtered on above
                        nnn = NodeNodeNgram(nodex=corpus, nodey=node_ngram.node, ngram=node_ngram.ngram, score=score)
                        nnn.save()
        else:
            print("Only corpus implemented yet, you put instead:", type(corpus))
......@@ -21,7 +21,7 @@ def get_session():
from aldjemy.core import get_engine
alias = 'default'
connection = connections[alias]
engine = create_engine("postgresql+psycopg2://alexandre:C8kdcUrAQy66U@localhost/gargandb",
engine = create_engine("postgresql+psycopg2://gargantua:C8kdcUrAQy66U@localhost/gargandb",
use_native_hstore=True)
Session = sessionmaker(bind=engine)
return Session()
......
#from .celery import app as async_app
......@@ -11,9 +11,9 @@ from sqlalchemy import text, distinct
from sqlalchemy.sql import func
from sqlalchemy.orm import aliased
from gargantext_web.views import move_to_trash
from .db import *
from node import models
def DebugHttpResponse(data):
    # Render *data* inside a minimal dark-themed HTML page, handy for
    # eyeballing arbitrary values in the browser while debugging.
    body = '<html><body style="background:#000;color:#FFF"><pre>%s</pre></body></html>' % (str(data), )
    return HttpResponse(body)
......@@ -47,10 +47,14 @@ _ngrams_order_columns = {
}
from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from rest_framework.permissions import IsAuthenticated
from rest_framework.views import APIView
from rest_framework.response import Response
from rest_framework.exceptions import APIException as _APIException
class APIException(_APIException):
    """REST-framework API error carrying a custom HTTP status code.

    Parameters
    ----------
    message : str
        Human-readable error detail returned to the client.
    code : int
        HTTP status code for the response (default 500).
    """
    def __init__(self, message, code=500):
        self.status_code = code
        # fixed: *message* was silently dropped — forward it to the DRF base
        # class so it becomes the exception's `detail` in the response body.
        super().__init__(message)
......@@ -200,7 +204,7 @@ class NodesChildrenDuplicates(APIView):
count = len(duplicate_nodes)
for node in duplicate_nodes:
print("deleting node ",node.id)
node.delete()
move_to_trash(node.id)
# print(delete_query)
# # delete_query.delete(synchronize_session=True)
# session.flush()
......@@ -552,11 +556,13 @@ class NodesChildrenQueries(APIView):
class NodesList(APIView):
authentication_classes = (SessionAuthentication, BasicAuthentication)
def get(self, request):
print("user id : " + str(request.user))
query = (session
.query(Node.id, Node.name, NodeType.name.label('type'))
.filter(Node.user_id == request.session._session_cache['_auth_user_id'])
.filter(Node.user_id == int(request.user.id))
.join(NodeType)
)
if 'type' in request.GET:
......@@ -579,6 +585,8 @@ class Nodes(APIView):
return JsonHttpResponse({
'id': node.id,
'name': node.name,
'parent_id': node.parent_id,
'type': cache.NodeType[node.type_id].name,
# 'type': node.type__name,
#'metadata': dict(node.metadata),
'metadata': node.metadata,
......@@ -589,13 +597,19 @@ class Nodes(APIView):
# it should take the subnodes into account as well,
# for better constistency...
def delete(self, request, node_id):
user = request.user
node = session.query(Node).filter(Node.id == node_id).first()
msgres = ""
msgres = str()
try:
node.delete()
msgres = node_id+" deleted!"
except:
msgres ="error deleting: "+node_id
move_to_trash(node_id)
msgres = node_id+" moved to Trash"
except Exception as error:
msgres ="error deleting : " + node_id + str(error)
return JsonHttpResponse({
'deleted': msgres,
......@@ -611,7 +625,7 @@ class CorpusController:
raise ValidationError('Corpora are identified by an integer.', 400)
corpusQuery = session.query(Node).filter(Node.id == corpus_id).first()
# print(str(corpusQuery))
# raise Http404("C'est toujours ça de pris.")
# raise Http404("404 error.")
if not corpusQuery:
raise Http404("No such corpus: %d" % (corpus_id, ))
corpus = corpusQuery.first()
......
# -*- coding: utf-8 -*-
#import os
#import djcelery
#
#from celery import Celery
#
#from django.conf import settings
#
## set the default Django settings module for the 'celery' program.
#os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext_web.settings')
#
#app = Celery('gargantext_web')
#
#
#app.conf.update(
# CELERY_RESULT_BACKEND='djcelery.backends.database:DatabaseBackend',
#)
#
#
#app.conf.update(
# CELERY_RESULT_BACKEND='djcelery.backends.cache:CacheBackend',
#)
#
## Using a string here means the worker will not have to
## pickle the object when using Windows.
##app.config_from_object('django.conf:settings')
#app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)
#
from celery import shared_task
from node import models
#@app.task(bind=True)
@shared_task
def debug_task(request):
    # Smoke-test task: log the repr of whatever request object the worker got.
    formatted = 'Request: {0!r}'.format(request)
    print(formatted)
from gargantext_web.db import session, Node
@shared_task
def apply_sum(x, y):
    # Sanity-check task: verifies both plain computation and database
    # access work from inside a Celery worker.
    total = x + y
    print(total)
    first_node_name = session.query(Node.name).first()
    print(first_node_name)
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
@shared_task
def apply_workflow(corpus_id):
    """Celery task: run the full import workflow for corpus *corpus_id*.

    Steps: parse the attached resources, clear the UI 'Processing' flag,
    extract ngrams from titles, then compute tf-idf scores.
    """
    # Fetch the corpus via SQLAlchemy for the processing helpers below.
    corpus = session.query(Node).filter(Node.id==corpus_id).first()

    parse_resources(corpus)

    try:
        print("-" *60)

        # With Django ORM
        # Clear the 'Processing' flag set at corpus creation so the UI stops
        # showing the corpus as busy.
        # NOTE(review): flag is cleared before ngram extraction / tfidf below
        # actually finish — confirm that is intended.
        corpus_django = models.Node.objects.get(id=corpus_id)
        corpus_django.metadata['Processing'] = 0
        corpus_django.save()

        print("-" *60)

        #TODO With SLA ORM (KO why?)
        # corpus.metadata['Processing'] = 0
        # session.add(corpus)
        # session.flush()
    except Exception as error:
        print(error)

    extract_ngrams(corpus, ['title'])
    compute_tfidf(corpus)
......@@ -2,6 +2,7 @@ from gargantext_web import settings
from node import models
__all__ = ['literalquery', 'session', 'cache', 'Session', 'bulk_insert', 'engine', 'get_cursor']
......@@ -56,6 +57,23 @@ for model_name, model in models.__dict__.items():
NodeNgram = Node_Ngram
NodeResource = Node_Resource
# manually declare the Node table...
from datetime import datetime
from sqlalchemy.types import *
from sqlalchemy.schema import Column, ForeignKey
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import relationship, aliased
# class Node(Base):
# __tablename__ = 'node_node'
# id = Column(Integer, primary_key=True)
# user_id = Column(Integer, ForeignKey('auth_user.id', ondelete='CASCADE'), index=True, nullable=False)
# type_id = Column(Integer, ForeignKey('node_nodetype.id', ondelete='CASCADE'), index=True, nullable=False)
# name = Column(String(255))
# language_id = Column(Integer, ForeignKey('node_language.id', ondelete='CASCADE'), index=True, nullable=False)
# date = Column(DateTime(), default=datetime.utcnow, nullable=True)
# metadata = Column(JSONB, default={}, nullable=False)
# debugging tool, to translate SQLAlchemy queries to string
......@@ -67,7 +85,6 @@ def literalquery(statement, dialect=None):
purposes only. Executing SQL statements with inline-rendered user values is
extremely insecure.
"""
from datetime import datetime
import sqlalchemy.orm
if isinstance(statement, sqlalchemy.orm.Query):
if dialect is None:
......
......@@ -14,11 +14,33 @@ BASE_DIR = os.path.dirname(os.path.dirname(__file__))
PROJECT_PATH = os.path.join(BASE_DIR, os.pardir)
PROJECT_PATH = os.path.abspath(PROJECT_PATH)
######################################################################
# ASYNCHRONOUS TASKS
import djcelery
djcelery.setup_loader()
BROKER_URL = 'amqp://guest:guest@localhost:5672/'
CELERY_IMPORTS=("node.models",)
CELERY_IMPORTS=("node.models","gargantext_web.celery")
#
#from celery import Celery
#
#app = Celery('gargantext_web')
#
#app.conf.update(
# CELERY_RESULT_BACKEND='djcelery.backends.database:DatabaseBackend',
#)
#
#
#app.conf.update(
# CELERY_RESULT_BACKEND='djcelery.backends.cache:CacheBackend',
#)
#
######################################################################
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.6/howto/deployment/checklist/
......@@ -48,8 +70,16 @@ TEMPLATE_DIRS = (
#ALLOWED_HOSTS = ['*',]
ALLOWED_HOSTS = ['localhost', 'master.polemic.be', 'beta.gargantext.org']
ALLOWED_HOSTS = ['localhost',
'gargantext.org',
'stable.gargantext.org',
'dev.gargantext.org',
'iscpif.gargantext.org',
'mines.gargantext.org',
'beta.gargantext.org',
'garg-dev.iscpif.fr',
'garg-stable.iscpif.fr',
]
# Application definition
......@@ -82,6 +112,16 @@ MIDDLEWARE_CLASSES = (
'django.middleware.clickjacking.XFrameOptionsMiddleware',
)
REST_SESSION_LOGIN = False
REST_FRAMEWORK = {
'DEFAULT_AUTHENTICATION_CLASSES': (
'rest_framework.authentication.TokenAuthentication',
'rest_framework.authentication.SessionAuthentication',
),
'DEFAULT_PERMISSION_CLASSES': (
'rest_framework.permissions.AllowAny',
),
}
WSGI_APPLICATION = 'wsgi.application'
......@@ -93,7 +133,7 @@ DATABASES = {
'default': {
'ENGINE': 'django.db.backends.postgresql_psycopg2',
'NAME': 'gargandb',
'USER': 'alexandre',
'USER': 'gargantua',
'PASSWORD': 'C8kdcUrAQy66U',
#'USER': 'gargantext',
#'PASSWORD': 'C8krdcURAQy99U',
......
from celery import shared_task
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
@shared_task
def apply_workflow(corpus):
    """Celery task: run the import workflow on *corpus* — parse its
    resources, extract ngrams from titles, then compute tf-idf."""
    parse_resources(corpus)
    extract_ngrams(corpus, ['title'])
    compute_tfidf(corpus)
......@@ -33,7 +33,7 @@ urlpatterns = patterns('',
# Project Management
url(r'^projects/$', views.projects),
url(r'^project/(\d+)/$', views_optimized.project),
url(r'^delete/(\d+)$', views.trash_node), # => api.node('id' = id, children = 'True', copies = False)
url(r'^delete/(\d+)$', views.delete_node), # => api.node('id' = id, children = 'True', copies = False)
# Corpus management
url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus),
......
......@@ -46,6 +46,8 @@ from sqlalchemy import or_, func
from gargantext_web import about
def login_user(request):
logout(request)
username = password = ''
......@@ -199,7 +201,6 @@ def home_view(request):
t = get_template('home.html')
user = request.user
date = datetime.datetime.now()
html = t.render(Context({\
'user': user,\
'date': date,\
......@@ -455,7 +456,21 @@ def empty_trash():
node.delete()
def trash_node(request, node_id):
def move_to_trash(node_id):
    """Soft-delete: switch node *node_id*'s type to 'Trash'.

    Returns the node's previous type id on success, or None when anything
    fails (node missing, DB error); failures are only logged, never raised.
    """
    try:
        node = session.query(Node).filter(Node.id == node_id).first()
        # If no such node exists, node is None and the next line raises,
        # which is caught and logged below.
        previous_type_id = node.type_id
        node.type_id = cache.NodeType['Trash'].id
        session.add(node)
        session.commit()
        return previous_type_id
    except Exception as error:
        # fixed: original concatenated node_id (possibly int) and the raw
        # exception object with "+", which itself raised a TypeError here.
        print("can not move to trash Node " + str(node_id) + ":" + str(error))
def delete_node(request, node_id):
# do we have a valid user?
user = request.user
......@@ -466,52 +481,18 @@ def trash_node(request, node_id):
if node.user_id != user.id:
return HttpResponseForbidden()
previous_type_id = node.type_id
node.type_id = cache.NodeType['Trash'].id
session.add(node)
session.commit()
if previous_type_id == cache.NodeType['Project'].id:
previous_type_id = move_to_trash(node_id)
if previous_type_id == cache.NodeType['Corpus'].id:
return HttpResponseRedirect('/project/' + str(node.parent_id))
else:
return HttpResponseRedirect('/projects/')
elif previous_type_id == cache.NodeType['Corpus'].id:
return HttpResponseRedirect('/project/' + str(session.query(Node.id).filter(Node.id==node.parent_id).first()[0]))
if settings.DEBUG == True:
empty_trash()
def delete_node(request, node_id):
    """Permanently delete node *node_id* and its children, then redirect.

    Unlike move_to_trash(), this removes the rows for good (Django ORM,
    inside one transaction). Redirects to the project list for a Project
    node, or to the parent project page for a Corpus node.
    NOTE(review): request.user is never checked against node.user_id here —
    confirm ownership is enforced by the caller / URL routing.
    """
    #nodes = session.query(Node).filter(or_(Node.id == node_id, Node.parent_id == node_id)).all()
    # try:
    # resources = session.query(Node_Resource).filter(Node_Resource.node_id==node_id).all()
    # if resources is not None:
    # for resource in resources:
    # session.delete(resource)
    #
    # except Exception as error:
    # print(error)
    #
    # node = session.query(Node).filter(Node.id == node_id).first()
    # if node is not None:
    # session.delete(node)
    # session.commit()
    node = models.Node.objects.get(id=node_id)
    with transaction.atomic():
        try:
            # Delete children first; a failure here is logged but does not
            # abort deleting the node itself.
            node.children.delete()
        except Exception as error:
            print(error)
        node.delete()

    if node.type_id == cache.NodeType['Project'].id:
        return HttpResponseRedirect('/projects/')
    elif node.type_id == cache.NodeType['Corpus'].id:
        # node_id comes from the URL pattern as a string, so "+" is valid here
        return HttpResponseRedirect('/project/' + node_id)
def delete_corpus(request, project_id, node_id):
# ORM Django
......
......@@ -7,6 +7,7 @@ from sqlalchemy.orm import aliased
from collections import defaultdict
from datetime import datetime
from time import sleep
from threading import Thread
from node.admin import CustomForm
......@@ -14,14 +15,14 @@ from gargantext_web.db import *
from gargantext_web.settings import DEBUG, MEDIA_ROOT
from gargantext_web.api import JsonHttpResponse
import json
import re
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
def project(request, project_id):
from gargantext_web.celery import apply_workflow
# SQLAlchemy session
session = Session()
def project(request, project_id):
# do we have a valid project id?
try:
......@@ -51,7 +52,7 @@ def project(request, project_id):
# ... sqlalchemy.func by Resource.type_id is the guilty
# ISSUE L51
corpus_query = (session
.query(Node.id, Node.name, func.count(ChildrenNode.id))
.query(Node.id, Node.name, func.count(ChildrenNode.id), Node.metadata['Processing'])
#.query(Node.id, Node.name, Resource.type_id, func.count(ChildrenNode.id))
#.join(Node_Resource, Node_Resource.node_id == Node.id)
#.join(Resource, Resource.id == Node_Resource.resource_id)
......@@ -66,8 +67,10 @@ def project(request, project_id):
documents_count_by_resourcetype = defaultdict(int)
corpora_count = 0
corpusID_dict = {}
for corpus_id, corpus_name, document_count in corpus_query:
for corpus_id, corpus_name, document_count, processing in corpus_query:
#print(corpus_id, processing)
# Not optimized GOTO ISSUE L51
resource_type_id = (session.query(Resource.type_id)
.join(Node_Resource, Node_Resource.resource_id == Resource.id)
......@@ -82,9 +85,10 @@ def project(request, project_id):
resourcetype = cache.ResourceType[resource_type_id]
resourcetype_name = resourcetype.name
corpora_by_resourcetype[resourcetype_name].append({
'id': corpus_id,
'name': corpus_name,
'count': document_count,
'id' : corpus_id,
'name' : corpus_name,
'count' : document_count,
'processing': processing,
})
documents_count_by_resourcetype[resourcetype_name] += document_count
corpora_count += 1
......@@ -93,7 +97,7 @@ def project(request, project_id):
# do the donut
total_documents_count = sum(documents_count_by_resourcetype.values())
donut = [
{ 'source': key,
{ 'source': re.sub(' \(.*$', '', key),
'count': value,
'part' : round(value * 100 / total_documents_count) if total_documents_count else 0,
}
......@@ -112,20 +116,21 @@ def project(request, project_id):
resourcetype = cache.ResourceType[form.cleaned_data['type']]
# which default language shall be used?
if resourcetype.name == "europress_french":
if resourcetype.name == "Europress (French)":
language_id = cache.Language['fr'].id
elif resourcetype.name == "europress_english":
elif resourcetype.name == "Europress (English)":
language_id = cache.Language['en'].id
else:
language_id = None
# corpus node instanciation as a Django model
corpus = Node(
name = name,
user_id = request.user.id,
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
name = name,
user_id = request.user.id,
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = language_id,
metadata = {'Processing' : 1,}
)
session.add(corpus)
session.commit()
......@@ -142,25 +147,25 @@ def project(request, project_id):
)
# let's start the workflow
try:
def apply_workflow(corpus):
parse_resources(corpus)
extract_ngrams(corpus, ['title'])
compute_tfidf(corpus)
if DEBUG:
apply_workflow(corpus)
if DEBUG is False:
apply_workflow.apply_async((corpus.id,),)
else:
thread = Thread(target=apply_workflow, args=(corpus, ), daemon=True)
thread.start()
#apply_workflow(corpus)
thread = Thread(target=apply_workflow, args=(corpus.id, ), daemon=True)
thread.start()
except Exception as error:
print('WORKFLOW ERROR')
print(error)
# redirect to the main project page
# TODO need to wait before response (need corpus update)
sleep(1)
return HttpResponseRedirect('/project/' + str(project_id))
else:
print('ERROR: BAD FORM')
else:
form = CustomForm()
# HTML output
return render(request, 'project.html', {
'form' : form,
......
......@@ -39,10 +39,10 @@ In PostreSQL
3) psql
4) CREATE USER alexandre WITH PASSWORD 'C8kdcUrAQy66U';
4) CREATE USER gargantua WITH PASSWORD 'C8kdcUrAQy66U';
(see gargantext_web/settings.py, DATABASES = { ... })
5) CREATE DATABASE gargandb WITH OWNER alexandre;
5) CREATE DATABASE gargandb WITH OWNER gargantua;
6) Ctrl + D
......@@ -80,7 +80,7 @@ Last steps of configuration
Warning: for ln, path has to be absolute!
5) patch CTE:
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/cte_tree.models.diff
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/patches/cte_tree.models.diff
6) init nodetypes and main variables
/srv/gargantext/manage.py shell < /srv/gargantext/init/init.py
......
sudo apt-get install postgresql
sudo apt-get install postgresql-contrib
sudo apt-get install python-virtualenv
sudo apt-get install libpng12-dev
sudo apt-get install libpng-dev
sudo apt-cache search freetype
sudo apt-get install libfreetype6-dev
sudo apt-cache search python-dev
sudo apt-get install python-dev
sudo apt-get install libpq-dev
sudo apt-get postgresql-contrib
sudo aptèget install libpq-dev
# To install all matplotlib dependencies (dirty approach — find a cleaner one)
sudo apt-get build-dep python-matplotlib
#Paquets Debian a installer
# easy_install -U distribute (matplotlib)
#lxml
sudo apt-get install libffi-dev
sudo apt-get install libxml2-dev
sudo apt-get install libxslt1-dev
# ipython readline
sudo apt-get install libncurses5-dev
sudo apt-get install pandoc
# scipy:
sudo apt-get install gfortran
sudo apt-get install libopenblas-dev
sudo apt-get install liblapack-dev
#nlpserver
sudo apt-get install libgflags-dev
sudo aptitude install libgoogle-glog-dev
source /srv/gargantext_env/bin/activate
pip3 install git+https://github.com/mathieurodic/aldjemy.git
ALTER TABLE ONLY node_node
ALTER COLUMN metadata
DROP NOT NULL
;
ALTER TABLE ONLY node_node
ALTER COLUMN metadata
DROP DEFAULT
;
ALTER TABLE ONLY node_node
ALTER COLUMN metadata
TYPE JSON
USING hstore_to_json(metadata)
;
ALTER TABLE ONLY node_node
ALTER COLUMN metadata
SET DEFAULT '{}'::json
;
ALTER TABLE ONLY node_node
ALTER COLUMN metadata
SET NOT NULL
;
......@@ -104,30 +104,15 @@ except Exception as error:
# In[33]:
try:
typePubmed = ResourceType.objects.get(name='pubmed')
typeIsi = ResourceType.objects.get(name='isi')
typeRis = ResourceType.objects.get(name='ris')
typePresseFrench = ResourceType.objects.get(name='europress_french')
typePresseEnglish = ResourceType.objects.get(name='europress_english')
from parsing.parsers_config import parsers
except Exception as error:
print(error)
typePubmed = ResourceType(name='pubmed')
typePubmed.save()
typeIsi = ResourceType(name='isi')
typeIsi.save()
typeRis = ResourceType(name='ris')
typeRis.save()
typePresseFrench = ResourceType(name='europress_french')
typePresseFrench.save()
typePresseEnglish = ResourceType(name='europress_english')
typePresseEnglish.save()
ResourceType.objects.all().delete()
for key in parsers.keys():
try:
ResourceType.objects.get_or_create(name=key)
except Exception as error:
print("Ressource Error: ", error)
# In[34]:
......
......@@ -4,6 +4,7 @@ psql -d gargandb -f init.sql
sleep 2
../manage.py syncdb
psql -d gargandb -f init2.sql
......
ALTER TABLE ONLY node_node ALTER COLUMN date SET DEFAULT CURRENT_DATE;
-- ALTER TABLE ONLY node_node ALTER COLUMN metadata SET DEFAULT ''::hstore;
-- ALTER TABLE ONLY node_node ALTER COLUMN metadata TYPE JSONB;
-- ALTER TABLE ONLY node_node ALTER COLUMN metadata SET DEFAULT '{}'::JSONB;
......@@ -54,16 +54,13 @@ import pycountry
Language.objects.all().delete()
for language in pycountry.languages:
if 'alpha2' in language.__dict__:
Language(
models.Language(
iso2 = language.alpha2,
iso3 = language.bibliographic,
fullname = language.name,
implemented = 1 if language.alpha2 in ['en', 'fr'] else 0,
).save()
english = Language.objects.get(iso2='en')
french = Language.objects.get(iso2='fr')
# Integration: users
......@@ -99,56 +96,10 @@ for node_type in node_types:
print('Initialize resource...')
resources = [
'pubmed', 'isi', 'ris', 'europress_french', 'europress_english']
for resource in resources:
models.ResourceType.objects.get_or_create(name=resource)
# TODO
# here some tests
# add a new project and some corpora to test it
# Integration: project
#
#print('Initialize project...')
#try:
# project = Node.objects.get(name='Bees project')
#except:
# project = Node(name='Bees project', type=typeProject, user=me)
# project.save()
#
# Integration: corpus
#print('Initialize corpus...')
#try:
# corpus_pubmed = Node.objects.get(name='PubMed corpus')
#except:
# corpus_pubmed = Node(parent=project, name='PubMed corpus', type=typeCorpus, user=me)
# corpus_pubmed.save()
#
#print('Initialize resource...')
#corpus_pubmed.add_resource(
# # file='./data_samples/pubmed.zip',
# #file='./data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
# file='/srv/gargantext_lib/data_samples/pubmed.xml',
# type=typePubmed,
# user=me
#)
#
#for resource in corpus_pubmed.get_resources():
# print('Resource #%d - %s - %s' % (resource.id, resource.digest, resource.file))
#
## print('Parse corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.parse_resources(verbose=True)
# print('Extract corpus #%d...' % (corpus_pubmed.id, ))
# corpus_pubmed.children.all().extract_ngrams(['title',])
# print('Parsed corpus #%d.' % (corpus_pubmed.id, ))
from parsing.parsers_config import parsers
for parser in parsers.keys():
models.ResourceType.objects.get_or_create(name=parser)
......
# See /usr/share/postfix/main.cf.dist for a commented, more complete version
# Debian specific: Specifying a file name will cause the first
# line of that file to be used as the name. The Debian default
# is /etc/mailname.
#myorigin = /etc/mailname
smtpd_banner = $myhostname ESMTP $mail_name (Debian)
biff = no
# appending .domain is the MUA's job.
append_dot_mydomain = no
# Uncomment the next line to generate "delayed mail" warnings
#delay_warning_time = 4h
readme_directory = no
# TLS parameters
smtpd_tls_cert_file=/etc/ssl/certs/ssl-cert-snakeoil.pem
smtpd_tls_key_file=/etc/ssl/private/ssl-cert-snakeoil.key
smtpd_use_tls=yes
smtpd_tls_session_cache_database = btree:${data_directory}/smtpd_scache
smtp_tls_session_cache_database = btree:${data_directory}/smtp_scache
# See /usr/share/doc/postfix/TLS_README.gz in the postfix-doc package for
# information on enabling SSL in the smtp client.
myhostname = garg-dev.iscpif.fr
alias_maps = hash:/etc/aliases
alias_database = hash:/etc/aliases
myorigin = /etc/mailname
mydestination = garg-dev.iscpif.fr, localhost.iscpif.fr, , localhost
relayhost = smtp.iscpif.fr
mynetworks = 127.0.0.0/8 [::ffff:127.0.0.0]/104 [::1]/128
mailbox_size_limit = 0
recipient_delimiter = +
inet_interfaces = all
#!/bin/bash
# Bootstrap a new server: mail setup, wheezy->jessie upgrade, SSH key
# exchange, and initial clone of the gargantext repository into /srv.

apt-get install sudo
sudo apt-get install postfix

# copy from tina
sudo cp 0*cf /etc/postfix/main.cf
sudo postfix reload

sed -i 's/wheezy/jessie/g' /etc/apt/sources.list
sudo aptitude update
sudo aptitude dist-upgrade
# dpkg-reconfigure locales => add GB

# generate our key, mail it out, then wait until it is authorized remotely
ssh-keygen
cat ~/.ssh/id_rsa.pub | mail alexandre@delanoe.org -s "Key Server $(hostname)"
echo "Put ~/.ssh/id_rsa.pub on remote to enable git pull please and press enter"
read answer

sudo mkdir /srv/gargantext
cd /srv
# fixed: /srv/gargantext was created by root above, so chown needs sudo too
sudo chown gargantua:www-data gargantext
# fixed: original line was malformed ("git clone ssh orign ssh://...")
git clone ssh://gitolite@delanoe.org:1979/gargantext
#!/bin/dash
# TODO do apt-get install --force-yes --force-yes
apt-get install --force-yes postgresql
apt-get install --force-yes postgresql-contrib
apt-get install --force-yes rabbitmq-server
apt-get install --force-yes tmux
apt-get install --force-yes uwsgi uwsgi-plugin-python3
apt-get install --force-yes python3.4-venv
#apt-get install --force-yes python-virtualenv
apt-get install --force-yes libpng12-dev
apt-get install --force-yes libpng-dev
apt-get install --force-yes libfreetype6-dev
apt-get install --force-yes python-dev
apt-get install --force-yes libpq-dev
apt-get install --force-yes libpq-dev
#apt-get build-dep python-matplotlib
#apt-get install --force-yes python-matplotlib
#Paquets Debian a installer
# easy_install --force-yes -U distribute (matplotlib)
#lxml
apt-get install --force-yes libffi-dev
apt-get install --force-yes libxml2-dev
apt-get install --force-yes libxslt1-dev
# ipython readline
apt-get install --force-yes libncurses5-dev
apt-get install --force-yes pandoc
# scipy:
apt-get install --force-yes gfortran
apt-get install --force-yes libopenblas-dev
apt-get install --force-yes liblapack-dev
#nlpserver
apt-get install --force-yes libgflags-dev
aptitude install --force-yes libgoogle-glog-dev
# MElt
# soon
## SERVER Configuration
# server configuration
apt-get install --force-yes nginx
# UWSGI with pcre support
apt-get install --force-yes libpcre3 libpcre3-dev
apt-get install --force-yes python3-pip
pip3 install --force-yes uwsgi
#!/bin/dash
sudo mkdir /srv/gargantext_env
sudo chown -R gargantua:www-data /srv/gargantext_env
pyvenv3 /srv/gargantext_env
source /srv/gargantext_env/bin/activate
pip install --upgrade pip
pip install -r 3-requirements.txt
pip3 install git+https://github.com/mathieurodic/aldjemy.git
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/patches/cte_tree.models.diff
......@@ -5,14 +5,14 @@ MarkupSafe==0.23
Pillow==2.5.3
Pygments==1.6
RandomWords==0.1.12
SQLAlchemy==0.9.8
SQLAlchemy==0.9.9
South==1.0
aldjemy==0.3.10
amqp==1.4.6
anyjson==0.3.3
bibtexparser==0.6.0
billiard==3.3.0.18
celery==3.1.15
billiard==3.3.0.19
celery==3.1.17
certifi==14.05.14
cffi==0.8.6
chardet==2.3.0
......@@ -36,7 +36,7 @@ djangorestframework==3.0.0
gensim==0.10.3
graphviz==0.4
ipython==2.2.0
kombu==3.0.23
kombu==3.0.24
lxml==3.4.1
#matplotlib==1.4.0
networkx==1.9
......@@ -52,7 +52,7 @@ pydot2==1.0.33
pyparsing==2.0.2
python-dateutil==2.2
python-igraph==0.7
pytz==2014.7
pytz==2015.2
pyzmq==14.3.1
readline==6.2.4.1
redis==2.10.3
......
#!/bin/bash
cd /tmp/
wget http://docs.delanoe.org/gargantext_lib.tar.bz2
cd /srv/
sudo mkdir gargantext_lib
sudo chown -R gargantua:www-data /srv/gargantext_lib
tar xvjf /tmp/gargantext_lib.tar.bz2
sudo chown -R gargantua:www-data /srv/gargantext_lib
cd /srv/gargantext_lib/js
git pull origin master git@github.com:PkSM3/garg.git
In PostgreSQL
-------------
1) Ensure postgres is started: sudo /etc/init.d/postgresql start
2) sudo su postgres
3) psql
4) CREATE USER gargantua WITH PASSWORD 'C8kdcUrAQy66U';
(see gargantext_web/settings.py, DATABASES = { ... })
5) CREATE DATABASE gargandb WITH OWNER gargantua;
6) Ctrl + D
7) psql gargandb
6) CREATE EXTENSION hstore;
7) Ctrl + D
# the upstream component nginx needs to connect to
upstream gargantext {
server unix:///tmp/gargantext.sock; # for a file socket
#server 127.0.0.1:8001; # for a web port socket (we'll use this first)
}
# configuration of the server
server {
# the port your site will be served on
listen 8002;
# the domain name it will serve for
server_name localhost; # substitute your machine's IP address or FQDN
charset utf-8;
# max upload size
client_max_body_size 75M; # adjust to taste
# Django media
location /media {
alias /var/www/gargantext/media; # your Django project's media files - amend as required
}
location /static {
alias /var/www/gargantext/static; # your Django project's static files - amend as required
}
# Finally, send all non-media requests to the Django server.
location / {
uwsgi_pass gargantext;
include uwsgi_params;
}
}
# django.ini file
[uwsgi]
env = DJANGO_SETTINGS_MODULE=gargantext_web.settings
#module = django.core.handlers.wsgi:WSGIHandler()
#touch-reload= /tmp/gargantext.reload
# the base directory
chdir = /srv/gargantext
# Django's wsgi file
#module = wsgi
wsgi-file = /srv/gargantext/wsgi.py
# the virtualenv
home = /srv/gargantext_env/
# master
master = true
# maximum number of processes
processes = 10
# the socket (use the full path to be safe)
socket = /tmp/gargantext.sock
# with appropriate permissions - *may* be needed
chmod-socket = 666
# clear environment on exit
vacuum = true
pidfile = /tmp/gargantext.pid
# respawn processes taking more than 20 seconds
harakiri = 120
# limit the project to 128 MB
#limit-as = 128
# respawn processes after serving 5000 requests
max-requests = 5000
# background the process & log
#daemonize = /var/log/uwsgi/gargantext.log
uid = 1000
gid = 1000
#!/bin/dash
#
echo "Copy nginx configuration in sites available"
sudo cp 4-NGINX_gargantext.conf /etc/nginx/sites-available
echo "Enable site"
cd /etc/nginx/sites-enable
sudo ln -s ../sites-available/gargantext.conf
sudo service nginx restart
echo "Copy UWSGI configuration"
sudo cp 4-UWSGI_gargantext.ini /etc/uwsgi/
sudo service uwsgi restart
#!/bin/bash
git checkout stable
source /srv/gargantext_env/bin/activate
cd /srv/gargantext
./manage.py collectstatic
chown -R gargantua:www-data /var/www/gargantext
ALTER TABLE ONLY node_node ALTER COLUMN date SET DEFAULT CURRENT_DATE ;
# Transfer ownership of every public-schema table in gargandb to gargantua.
for tbl in $(psql -qAt -c "select tablename from pg_tables where schemaname = 'public';" gargandb) ; do
    psql -c "alter table $tbl owner to gargantua" gargandb ;
done
ALTER TABLE ONLY node_node ALTER COLUMN date SET DEFAULT CURRENT_DATE ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata DROP NOT NULL ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata DROP DEFAULT ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata TYPE JSONB USING hstore_to_json(metadata)::jsonb ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata SET DEFAULT '{}'::jsonb ;
ALTER TABLE ONLY node_node ALTER COLUMN metadata SET NOT NULL ;
ALTER TABLE ONLY node_node ALTER COLUMN date SET DEFAULT CURRENT_DATE ;
......@@ -29,6 +29,7 @@ from celery import current_app
import os
import subprocess
from parsing.parsers_config import parsers
# Some usefull functions
# TODO: start the function name with an underscore (private)
......@@ -194,15 +195,19 @@ class Node(CTENode):
print("= = = = = = = = = = =\n")
for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource
parser = defaultdict(lambda:FileParser.FileParser, {
'istext' : ISText,
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
})[resource.type.name]()
parser = defaultdict(lambda:FileParser.FileParser, parsers
# {
# 'istext' : ISText,
# 'pubmed' : PubmedFileParser,
# 'isi' : IsiFileParser,
# 'ris' : RisFileParser,
# 'RIS (Jstor)' : JstorFileParser,
# 'europress' : EuropressFileParser,
# 'europress_french' : EuropressFileParser,
# 'europress_english' : EuropressFileParser,
# }
)[resource.type.name]()
metadata_list += parser.parse(str(resource.file))
type_id = NodeType.objects.get(name='Document').id
langages_cache = LanguagesCache()
......
from .RisFileParser import RisFileParser
class IsiFileParser(RisFileParser):
_parameters = {
......
from .RisFileParser import RisFileParser
class JstorFileParser(RisFileParser):
    """RIS parser variant preconfigured for Jstor exports.

    Only the tag table differs from the generic RIS parser: Jstor
    delivers the DOI under ``UR`` and splits the publication date
    across ``Y1`` (year) and ``PD`` (month).
    """

    # Maps each two-letter RIS tag (bytes) to its handling rule.
    # "delimiter" presumably closes the current record; "metadata"
    # stores the value under `key`, and `separator` looks like the
    # join string for repeated lines — confirm against RisFileParser.
    _parameters = {
        b"ER": dict(type="delimiter"),
        b"TI": dict(type="metadata", key="title", separator=" "),
        b"AU": dict(type="metadata", key="authors", separator=", "),
        b"UR": dict(type="metadata", key="doi"),
        b"Y1": dict(type="metadata", key="publication_year"),
        b"PD": dict(type="metadata", key="publication_month"),
        b"LA": dict(type="metadata", key="language_iso2"),
        b"AB": dict(type="metadata", key="abstract", separator=" "),
        b"WC": dict(type="metadata", key="fields"),
    }
from .RisFileParser import RisFileParser
from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
from .PubmedFileParser import PubmedFileParser
from .EuropressFileParser import EuropressFileParser
from .ISText import ISText
......@@ -7,8 +7,7 @@ from math import log
from gargantext_web.db import *
from .FileParsers import *
from .parsers_config import parsers as _parsers
class DebugTime:
......@@ -31,18 +30,12 @@ class DebugTime:
# keep all the parsers in a cache
class Parsers(defaultdict):
_parsers = {
'pubmed' : PubmedFileParser,
'istex' : ISText,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
'europress_french' : EuropressFileParser,
'europress_english' : EuropressFileParser,
}
def __init__(self):
self._parsers = _parsers
def __missing__(self, key):
if key not in self._parsers:
#print(self._parsers.keys())
if key not in self._parsers.keys():
raise NotImplementedError('No such parser: "%s"' % (key))
parser = self._parsers[key]()
self[key] = parser
......@@ -238,11 +231,13 @@ def extract_ngrams(corpus, keys):
terms = ' '.join([token for token, tag in ngram]).lower()
# TODO BUG here
if n == 1:
tag_id = cache.Tag[ngram[0][1]].id
#tag_id = 1
#tag_id = cache.Tag[ngram[0][1]].id
tag_id = 1
#print('tag_id', tag_id)
elif n > 1:
tag_id = cache.Tag['NN'].id
tag_id = 1
#tag_id = cache.Tag[ngram[0][1]].id
#tag_id = cache.Tag['NN'].id
#tag_id = 14
#print('tag_id_2', tag_id)
node_ngram_list[node_id][terms] += 1
......
from .FileParsers import *
# Registry mapping the human-readable resource-type name (the value
# stored in the ResourceType table and shown in the upload UI) to the
# FileParser subclass that parses files of that type.  Looked up via
# resource.type.name in Node.parse_resources and the Parsers cache.
parsers = {
'Pubmed (xml format)' : PubmedFileParser,
'Web of Science (ISI format)' : IsiFileParser,
'Scopus (RIS format)' : RisFileParser,
'Zotero (RIS format)' : RisFileParser,
'Jstor (RIS format)' : JstorFileParser,
# NOTE(review): generic 'Europress' key left commented out in favour of
# the two language-specific entries below (both use EuropressFileParser).
#'Europress' : EuropressFileParser,
'Europress (French)' : EuropressFileParser,
'Europress (English)' : EuropressFileParser,
}
......@@ -34,13 +34,15 @@ from gargantext_web.api import JsonHttpResponse
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
from gargantext_web.celery import apply_workflow
from time import sleep
def getGlobalStats(request ):
print(request.method)
alist = ["bar","foo"]
if request.method == "POST":
N = 1000
N = 100
query = request.POST["query"]
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
......@@ -81,9 +83,6 @@ def getGlobalStatsISTEXT(request ):
def doTheQuery(request , project_id):
alist = ["hola","mundo"]
# SQLAlchemy session
session = Session()
# do we have a valid project id?
try:
project_id = int(project_id)
......@@ -120,7 +119,7 @@ def doTheQuery(request , project_id):
urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
alist = ["tudo fixe" , "tudo bem"]
resourcetype = cache.ResourceType["pubmed"]
resourcetype = cache.ResourceType["Pubmed (xml format)"]
# corpus node instanciation as a Django model
corpus = Node(
......@@ -129,6 +128,7 @@ def doTheQuery(request , project_id):
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = None,
metadata = {'Processing' : 1,}
)
session.add(corpus)
session.commit()
......@@ -165,18 +165,15 @@ def doTheQuery(request , project_id):
if dwnldsOK == 0: return JsonHttpResponse(["fail"])
try:
def apply_workflow(corpus):
parse_resources(corpus)
extract_ngrams(corpus, ['title'])
compute_tfidf(corpus)
if DEBUG:
apply_workflow(corpus)
if not DEBUG:
apply_workflow.apply_async((corpus.id,),)
else:
thread = threading.Thread(target=apply_workflow, args=(corpus, ), daemon=True)
thread = threading.Thread(target=apply_workflow, args=(corpus.id, ), daemon=True)
thread.start()
except Exception as error:
print('WORKFLOW ERROR')
print(error)
sleep(1)
return HttpResponseRedirect('/project/' + str(project_id))
data = alist
......
......@@ -83,11 +83,12 @@
<li>{{ key }}</li>
<ul>
{% for corpus in corpora %}
<li> {% ifnotequal corpus.count 0 %}
<a href="/project/{{project.id}}/corpus/{{corpus.id}}"> {{corpus.name}} </a> , {{ corpus.count }} Documents
{% else %}
<li>
{% ifequal corpus.processing 1 %}
{{corpus.name}} : <img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Processing, drink a cup of tea, and refresh the page :)
{% endifnotequal %}
{% else %}
<a href="/project/{{project.id}}/corpus/{{corpus.id}}"> {{corpus.name}} </a> , {{ corpus.count }} Documents
{% endifequal %}
<button type="button" class="btn btn-xs btn-default" data-container="body" data-toggle="popover" data-placement="bottom"
data-content='
<ul>
......@@ -321,7 +322,7 @@
console.log("theType:")
console.log(theType)
if(theType=="pubmed") {
if(theType=="Pubmed (xml format)") {
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/pubmedquery",
......@@ -336,7 +337,7 @@
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 1000 sample!")
$("#submit_thing").html("Process a 100 sample!")
thequeries = data
var N=0,k=0;
......@@ -425,7 +426,7 @@
$( "#id_name" ).on('input',function(e){
console.log($(this).val())
if(theType=="pubmed") testPUBMED( $(this).val() )
if(theType=="Pubmed (xml format)") testPUBMED( $(this).val() )
});
}
}
......@@ -433,7 +434,7 @@
//CSS events for changing the Select element
function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
if(selected=="pubmed" || selected=="istex") {
if(selected=="Pubmed (xml format)" || selected=="istext") {
// if(selected=="pubmed") {
console.log("show the button for: "+selected)
$("#pubmedcrawl").css("visibility", "visible");
......
......@@ -19,18 +19,21 @@
{% if documents %}
<div id="delAll" style="visibility: hidden;">
<button onclick="deleteDuplicates(theurl);">Delete Duplicates</button>
</div>
<ul>
{% for doc in documents %}
{% if doc.date %}
<li><div id="doc_{{doc.id}}"> <b>{{ doc.date }}</b>: <a target="_blank" href="/nodeinfo/{{doc.id}}">{{ doc.name}}</a> , @ {{ doc.metadata.source}}</div></li>
{% endif %}
{% endfor %}
<div id="delAll" style="visibility: hidden;">
<center>
<button onclick="deleteDuplicates(theurl);">Delete all Duplicates in one click</button>
</center>
</div>
</ul>
<script>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment