humanities / gargantext · Commits · Commit 09535a2c

[MERGE] cern and unstable.

Authored May 27, 2016 by delanoe
Parents: 8285a652, 5d7f8517

Showing 20 changed files with 551 additions and 142 deletions (+551 −142)
* create_doc.sh (+1 −1)
* docs/contribution.md (+3 −26)
* docs/overview/parser.md (+93 −0)
* gargantext/constants.py (+57 −45)
* gargantext/models/nodes.py (+7 −3)
* gargantext/requirements.pip (+33 −0)
* gargantext/util/files.py (+8 −0)
* gargantext/util/http.py (+1 −0)
* gargantext/util/parsers/Cern.py (+93 −0)
* gargantext/util/parsers/_Parser.py (+9 −3)
* gargantext/util/parsers/__init__.py (+1 −1)
* gargantext/util/taggers/__init__.py (+3 −3)
* gargantext/views/pages/projects.py (+50 −47)
* gargantext/views/pages/terms.py (+2 −2)
* install/python/requirements.txt (+2 −0)
* install/run_docker.md (+1 −3)
* moissonneurs/cern.py (+179 −0)
* moissonneurs/istex.py (+2 −2)
* moissonneurs/pubmed.py (+2 −2)
* moissonneurs/urls.py (+4 −4)
create_doc.sh

```diff
 mkdocs build --clean
-mkdocs serve
+mkdocs serve --dev-addr=0.0.0.0:8888
```
docs/contribution.md

```diff
@@ -11,20 +11,16 @@
 ##Gargantex
-* Gargantex box install
+* Gargantex box install see [install procedure](install.md) (S.I.R.= Setup Install & Run procedures)
 * Architecture Overview
 * Database Schema Overview
 * Interface design Overview
 ##To do:
 * Docs
-* Interface deisgn
+* Interface design
-* Parsers/scrapers
+* [Parsers](./overview/parser.md) / scrappers(./overview/scraper.md)
 * Computing
 ## How to contribute:
@@ -35,22 +31,3 @@
 5. Test
 6. Commit
-### Exemple1: Adding a parser
-* create your new file cern.py into gargantex/scrapers/
-* reference into gargantex/scrapers/urls.py
-  add this line:
-  import scrapers.cern as cern
-* reference into gargantext/constants
-```
-# type 9
-{ 'name': 'Cern',
-  'parser': CernParser,
-  'default_language': 'en',
-},
-```
-* add an APIKEY in gargantex/settings
-### Exemple2: User Interface Design
```
docs/overview/parser.md — 0 → 100644 (new file)

# HOW TO: Reference a new webscraper/API + parser

## Global scope

Three main moves:

* develop and index a parser in gargantext.util.parsers
* develop and index a scraper in gargantext.moissonneurs
* adapt the forms for the new source in templates and views

## Reference the parser into the gargantext website

The gargantext website is stored in gargantext/gargantext.

### Reference your new parser in constants.py

* import your parser (l. 125)

```
from gargantext.util.parsers import \
    EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser
```

The name corresponds to the parser class referenced in gargantext/util/parsers; here the name is CernParser.

* index your RESOURCETYPE in RESOURCETYPES (l. 145) **at the end of the list**

```
# type 10
{   "name": 'SCOAP (XML MARC21 Format)',
    "parser": CernParser,
    "default_language": "en",
    'accepted_formats': ["zip", "xml"],
},
```

Note that the name here is composed of the API name (SCOAP) plus (GENERICFILETYPE FORMAT_XML Format). This naming convention encodes three things:

* the name of the API (different from the producing organisation)
* the file type: XML
* the XML norm of that format: MARC21 (cf. CernParser in gargantext/util/parsers/Cern.py)

The default_language corresponds to the default accepted language, which **should load** the corresponding default tagger:

```
from gargantext.util.taggers import NltkTagger
```

TO DO: load tagger types on demand, depending on the languages and the install
TO DO: provide a module to download additional parsers
TO DO: provide install tagger module scripts inside lib

The formats correspond to the file types accepted when a file is submitted through the parsing form available in `gargantext/view/pages/projects.py` and exposed in `/templates/pages/projects/project.html`.

## Reference your parser script

Add your parser script into the folder gargantext/util/parsers/ (here the filename is Cern.py) and declare it in gargantext/util/parsers/__init__.py:

```
from .Cern import CernParser
```

At this step you will be able to see your parser and add a file through the form, but nothing will happen yet.
## How to write the parser script

Three main (and only) requirements:

* your parser class should inherit from the base class _Parser() (`gargantext/gargantext/util/parsers/_Parser`)
* your parser class must have a parse method that takes a **file buffer** as input
* your parser must structure and store its data in a variable named **hyperdata_list**, so it can be properly indexed by the toolchain

Be careful with the date format: provide publication_date as a string formatted as YYYY-mm-dd HH:MM:SS.
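The three requirements can be sketched in a few self-contained lines. This is only an illustration: the `BaseParser` stub and the tab-separated sample input are assumptions standing in for gargantext's real `_Parser` base class and input files.

```python
from datetime import datetime
from io import BytesIO

class BaseParser:
    """Stub standing in for gargantext/util/parsers/_Parser (assumption)."""
    pass

class MyParser(BaseParser):                      # requirement 1: inherit the base class
    def parse(self, file):                       # requirement 2: parse() takes a file buffer
        hyperdata_list = []                      # requirement 3: results go in hyperdata_list
        for line in file.read().decode("utf-8").splitlines():
            title, date = line.split("\t")
            # dates must be "YYYY-mm-dd HH:MM:SS" strings
            parsed = datetime.strptime(date, "%Y-%m-%d")
            hyperdata_list.append({
                "title": title,
                "publication_date": parsed.strftime("%Y-%m-%d %H:%M:%S"),
            })
        return hyperdata_list

buf = BytesIO(b"Some article\t2016-05-27")
print(MyParser().parse(buf))
# [{'title': 'Some article', 'publication_date': '2016-05-27 00:00:00'}]
```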
# Adding a scraper API to offer a search option

In progress:

* add a pop-up question "Do you have a corpus?"; the search option is in /templates/pages/projects/project.html, line 181

## Reference a scraper (moissonneur) into gargantext

* add accepted_formats in constants
* add a check_file routine in the form check ==> but it should inherit from utils/files.py, which also implements the upload size limit check

# Suggestions for next steps:

* XML parsers: MARC21, UNIMARC, ...
* A project type is qualified by the first element added, i.e. the first element determines the corpus type of all the corpora within the project
gargantext/constants.py

```diff
@@ -112,104 +112,116 @@ INDEXED_HYPERDATA = {
 }
 
-from gargantext.util.taggers import FrenchMeltTagger, TurboTagger
+# from gargantext.util.taggers import FrenchMeltTagger, TurboTagger
+from gargantext.util.taggers import NltkTagger
 
 LANGUAGES = {
     'en': {
         #'tagger': EnglishMeltTagger,
-        'tagger': TurboTagger,
+        # 'tagger': TurboTagger,
+        # 'tagger': NltkTagger,
+        'tagger': NltkTagger,
     },
     'fr': {
-        'tagger': FrenchMeltTagger,
+        # 'tagger': FrenchMeltTagger,
         # 'tagger': TreeTagger,
+        'tagger': NltkTagger,
     },
 }
 
 from gargantext.util.parsers import \
-    EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser
+    EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser
 
-def resourcetype(name):
-    '''
-    resourcetype :: String -> Int
-    Usage : resourcetype("Europress (English)") == 1
-    Examples in scrapers scripts (Pubmed or ISTex for instance).
-    '''
-    return [n[0] for n in enumerate(r['name'] for r in RESOURCETYPES) if n[1] == name][0]
-
-def resourcename(corpus):
-    '''
-    resourcetype :: Corpus -> String
-    Usage : resourcename(corpus) == "ISTex"
-    '''
-    resource = corpus.resources()[0]
-    resourcename = RESOURCETYPES[resource['type']]['name']
-    return re.sub(r'\(.*', '', resourcename)
+#from gargantext.util.scrappers import \
+#    CernScraper
+def get_resource(corpus_type):
+    '''get ressources values for a given ressource_type id'''
+    for n in RESOURCETYPES:
+        if n["type"] == corpus_type:
+            return n
 
 RESOURCETYPES = [
     # type 0
-    {   'name': 'Select database below',
+    {   'type': 0,
+        'name': 'Select database below',
         'parser': None,
         'default_language': None,
     },
     # type 1
-    {   'name': 'Europress (English)',
+    {   'type': 1,
+        'name': 'Europress (English)',
         'parser': EuropressParser,
         'default_language': 'en',
+        'accepted_formats': ["zip",],
     },
     # type 2
-    {   'name': 'Europress (French)',
+    {   'type': 2,
+        'name': 'Europress (French)',
         'parser': EuropressParser,
         'default_language': 'fr',
+        'accepted_formats': ["zip",],
     },
     # type 3
-    {   'name': 'Jstor (RIS format)',
+    {   'type': 3,
+        'name': 'Jstor (RIS format)',
         'parser': RISParser,
         'default_language': 'en',
+        'accepted_formats': ["zip",],
     },
     # type 4
-    {   'name': 'Pubmed (XML format)',
+    {   'type': 4,
+        'name': 'Pubmed (XML format)',
         'parser': PubmedParser,
         'default_language': 'en',
+        'accepted_formats': ["zip",],
     },
     # type 5
-    {   'name': 'Scopus (RIS format)',
+    {   'type': 5,
+        'name': 'Scopus (RIS format)',
         'parser': RISParser,
         'default_language': 'en',
+        'accepted_formats': ["zip",],
     },
     # type 6
-    {   'name': 'Web of Science (ISI format)',
+    {   'type': 6,
+        'name': 'Web of Science (ISI format)',
         'parser': ISIParser,
         'default_language': 'en',
+        'accepted_formats': ["zip",],
     },
     # type 7
-    {   'name': 'Zotero (RIS format)',
+    {   'type': 7,
+        'name': 'Zotero (RIS format)',
         'parser': RISParser,
         'default_language': 'en',
+        'accepted_formats': ["zip",],
     },
     # type 8
-    {   'name': 'CSV',
+    {   'type': 8,
+        'name': 'CSV',
         'parser': CSVParser,
         'default_language': 'en',
+        'accepted_formats': ["csv"],
     },
     # type 9
-    {   'name': 'ISTex',
+    {   "type": 9,
+        'name': 'ISTex',
         'parser': ISTexParser,
         'default_language': 'en',
+        'accepted_formats': ["zip",],
     },
+    # type 10
+    {   "type": 10,
+        "name": 'SCOAP (XML MARC21 Format)',
+        "parser": CernParser,
+        "default_language": "en",
+        'accepted_formats': ["zip", "xml"],
+        #~ "scrapper": CernScrapper,
+        #~ "base_url": "http://api.scoap3.org/search?",
+    },
 ]
 
 # linguistic extraction parameters ---------------------------------------------
 DEFAULT_RANK_CUTOFF_RATIO = .75  # MAINLIST maximum terms in %
@@ -234,8 +246,8 @@ DEFAULT_ALL_LOWERCASE_FLAG = True # lowercase ngrams before recording
 # occurring at sentence beginning)
 # ------------------------------------------------------------------------------
 # other parameters
 # default number of docs POSTed to scrappers.views.py
 # (at page project > add a corpus > scan/process sample)
 QUERY_SIZE_N_DEFAULT = 1000
@@ -245,7 +257,7 @@ from .settings import BASE_DIR
 # uploads/.gitignore prevents corpora indexing
 # copora can be either a folder or symlink towards specific partition
 UPLOAD_DIRECTORY   = os.path.join(BASE_DIR, 'uploads/corpora')
 UPLOAD_LIMIT       = 1024 * 1024 * 1024
 DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
```
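With explicit `'type'` keys, entries no longer have to sit at their list index, unlike the old `RESOURCETYPES[resource['type']]` lookups. A standalone sketch of how `get_resource` resolves a type id (the two-entry list is a made-up stand-in for the full RESOURCETYPES above):

```python
# minimal stand-in for the RESOURCETYPES list in constants.py (only two entries)
RESOURCETYPES = [
    {'type': 0, 'name': 'Select database below', 'parser': None,
     'default_language': None},
    {'type': 10, 'name': 'SCOAP (XML MARC21 Format)', 'parser': 'CernParser',
     'default_language': 'en', 'accepted_formats': ["zip", "xml"]},
]

def get_resource(corpus_type):
    '''get resource values for a given resource_type id'''
    for n in RESOURCETYPES:
        if n["type"] == corpus_type:
            return n

print(get_resource(10)["name"])   # SCOAP (XML MARC21 Format)
print(get_resource(99))           # None (no such type id)
```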
gargantext/models/nodes.py

```diff
@@ -110,6 +110,7 @@ class Node(Base):
         if order is not None:
             query = query.order_by(Node.name)
         return query
 
     def add_child(self, **kwargs):
@@ -135,7 +136,7 @@ class Node(Base):
             self['resources'] = MutableList()
         return self['resources']
 
-    def add_resource(self, type, path=None, url=None):
+    def add_resource(self, type, path=None, url=None, **kwargs):
         """Attach a resource to a given node.
         Mainly used for corpora.
@@ -145,10 +146,13 @@ class Node(Base):
         {'extracted': True,
          'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Français-2015-12-11.zip',
          'type': 1,
-         'url': None}
+         'url': None,
+         'status':
+         'status_message':
+        }
         """
         self.resources().append(MutableDict(
-            {'type': type, 'path': path, 'url': url, 'extracted': False}
+            {'type': type, 'path': path, 'url': url, 'extracted': False, **kwargs}
         ))
 
     def status(self, action=None, progress=0, complete=False, error=None):
```
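The `{…, **kwargs}` merge in `add_resource` relies on PEP 448 unpacking inside a dict display (Python 3.5+). A standalone illustration; the status fields passed here are made up for the example:

```python
def make_resource(type, path=None, url=None, **kwargs):
    # extra keyword arguments (e.g. status fields) are merged into the dict
    return {'type': type, 'path': path, 'url': url, 'extracted': False, **kwargs}

r = make_resource(1, path='/tmp/corpus.zip',
                  status='parsing', status_message='in progress')
print(r['status'], r['extracted'])   # parsing False
```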
gargantext/requirements.pip — 0 → 100644 (new file)
amqp==1.4.9
anyjson==0.3.3
beautifulsoup4==4.4.1
billiard==3.3.0.22
celery==3.1.20
chardet==2.3.0
dateparser==0.3.2
decorator==4.0.9
Django==1.9.2
django-celery==3.1.17
django-pgfields==1.4.4
django-pgjsonb==0.0.16
djangorestframework==3.3.2
html5lib==0.9999999
jdatetime==1.7.2
kombu==3.0.33
lxml==3.5.0
networkx==1.11
nltk==3.1
numpy==1.10.4
pandas==0.18.0
pkg-resources==0.0.0
psycopg2==2.6.1
pycountry==1.20
python-dateutil==2.4.2
pytz==2015.7
PyYAML==3.11
RandomWords==0.1.12
requests==2.10.0
six==1.10.0
SQLAlchemy==1.1.0b1.dev0
ujson==1.35
umalqurra==0.2
gargantext/util/files.py

```diff
@@ -23,13 +23,21 @@ def download(url, name=''):
         basedir = DOWNLOAD_DIRECTORY,
     )
 
+def check_format(corpus_type, name):
+    #~ if True:
+    acc_formats = RESOURCETYPES[corpus_type]["accepted_formats"]
+    if name.split(".")[-1].lower() not in acc_formats:
+        raise TypeError('Uncorrect format of file. File must be a %s file' % " or ".join(acc_formats))
+
 def upload(uploaded):
     if uploaded.size > UPLOAD_LIMIT:
         raise IOError('Uploaded file is bigger than allowed: %d > %d' % (
             uploaded.size,
             UPLOAD_LIMIT,
         ))
     return save(
         contents = uploaded.file.read(),
         name = uploaded.name,
```
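The new `check_format` boils down to comparing the lowercased last dot-suffix of the filename against the accepted list. In isolation (the accepted formats passed here are sample values):

```python
def check_format(acc_formats, name):
    # reject files whose extension is not in the accepted list
    if name.split(".")[-1].lower() not in acc_formats:
        raise TypeError('File must be a %s file' % " or ".join(acc_formats))

check_format(["zip", "xml"], "records.XML")   # ok: the check is case-insensitive
try:
    check_format(["zip", "xml"], "records.csv")
except TypeError as e:
    print(e)   # File must be a zip or xml file
```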
gargantext/util/http.py

```diff
@@ -16,6 +16,7 @@ def requires_auth(func):
     Also passes the URL to redirect towards as a GET parameter.
     """
     def _requires_auth(request, *args, **kwargs):
+        #print(request.user.is_authenticated())
         if not request.user.is_authenticated():
             url = '/auth/login/?next=%s' % urlencode(request.path)
             return redirect(url)
```
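The `requires_auth` pattern is a plain closure-based decorator. A framework-free sketch of the same control flow, with a dict standing in for Django's request object and `quote_plus` standing in for gargantext's `urlencode` helper (both assumptions):

```python
from urllib.parse import quote_plus as urlencode

def requires_auth(func):
    # redirect anonymous users to the login page, keeping the target URL
    def _requires_auth(request, *args, **kwargs):
        if not request["authenticated"]:
            return 'redirect:/auth/login/?next=%s' % urlencode(request["path"])
        return func(request, *args, **kwargs)
    return _requires_auth

@requires_auth
def projects(request):
    return "projects page"

print(projects({"authenticated": False, "path": "/projects/1/"}))
print(projects({"authenticated": True, "path": "/projects/1/"}))   # projects page
```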
gargantext/util/parsers/Cern.py — 0 → 100644 (new file)

```python
from ._Parser import Parser
from datetime import datetime
from bs4 import BeautifulSoup
from lxml import etree

class CernParser(Parser):
    # mapping MARC21 ==> hyperdata
    MARC21 = {
        # main author
        "100": {"a": "authors", "v": "authors_affiliations",
                "w": "authors_countries", "m": "authors_mails"},
        # co-authors, merged back into the authors list with the main author at [0]
        "700": {"a": "authors", "v": "authors_affiliations",
                "w": "authors_countries"},
        "773": {"c": "pages", "n": "issue", "p": "journal",
                "v": "volume", "y": "publication_year"},
        "024": {"a": "doi"},
        #"037": {"a": "arxiv"},
        #"022": {"a": "isbn"},
        "245": {"a": "title"},
        "520": {"a": "abstract"},
        "260": {"b": "publisher", "c": "publication_date"},
        #"024": {"t": "realdate_full_"},  # corresponds to the query date
        #"540": {"a": "licence"},
        #"653": {"a": "keywords"},
        "856": {"u": "pdf_source"},
    }

    def format_date(self, hyperdata):
        '''formatting pubdate'''
        prefix = "publication"
        date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
        #hyperdata[prefix + "_year"] = date.strftime('%Y')
        hyperdata[prefix + "_month"]  = date.strftime("%m")
        hyperdata[prefix + "_day"]    = date.strftime("%d")
        hyperdata[prefix + "_hour"]   = date.strftime("%H")
        hyperdata[prefix + "_minute"] = date.strftime("%M")
        hyperdata[prefix + "_second"] = date.strftime("%S")
        hyperdata[prefix + "_date"]   = date.strftime("%Y-%m-%d %H:%M:%S")
        print("Date", hyperdata["publication_date"])
        return hyperdata

    def parse(self, file):
        hyperdata_list = []
        doc = file.read()
        soup = BeautifulSoup(doc.decode("utf-8"), "lxml")
        for record in soup.find_all("record"):
            hyperdata = {v: [] for v in self.MARC21["100"].values()}
            hyperdata["uid"] = soup.find("controlfield").text
            hyperdata["language_iso2"] = "en"
            for data in soup.find_all("datafield"):
                tag = data.get("tag")
                if tag in self.MARC21.keys():
                    for sub in data.find_all("subfield"):
                        code = sub.get("code")
                        if code in self.MARC21[tag].keys():
                            if tag == "100":
                                try:
                                    hyperdata[self.MARC21["100"][code]].insert(0, sub.text)
                                except AttributeError:
                                    hyperdata[self.MARC21["100"][code]] = [sub.text]
                            #print ("1", self.MARC21["100"][code], hyperdata[self.MARC21["100"][code]])
                            elif tag == "700":
                                #print ("7", self.MARC21["100"][code], hyperdata[self.MARC21["100"][code]])
                                try:
                                    hyperdata[self.MARC21["100"][code]].append(sub.text)
                                except AttributeError:
                                    hyperdata[self.MARC21["100"][code]] = [sub.text]
                            else:
                                hyperdata[self.MARC21[tag][code]] = sub.text
            hyperdata["authors_countries"]    = (",").join(hyperdata["authors_countries"])
            hyperdata["authors_affiliations"] = (",").join(hyperdata["authors_affiliations"])
            hyperdata["authors"]              = (",").join(hyperdata["authors"])
            hyperdata["authors_mails"]        = (",").join(hyperdata["authors_mails"])
            hyperdata = self.format_date(hyperdata)
            hyperdata_list.append(hyperdata)
        return hyperdata_list
```
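`format_date` expands a `YYYY-mm-dd` string into the per-component fields the toolchain indexes. The core transformation in isolation, on a plain dict:

```python
from datetime import datetime

def expand_date(hyperdata, prefix="publication"):
    # split "YYYY-mm-dd" into the component fields used by the toolchain
    date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
    for field, fmt in [("month", "%m"), ("day", "%d"), ("hour", "%H"),
                       ("minute", "%M"), ("second", "%S")]:
        hyperdata[prefix + "_" + field] = date.strftime(fmt)
    # normalise the date itself to "YYYY-mm-dd HH:MM:SS"
    hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
    return hyperdata

h = expand_date({"publication_date": "2016-05-27"})
print(h["publication_date"])   # 2016-05-27 00:00:00
print(h["publication_month"])  # 05
```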
gargantext/util/parsers/_Parser.py

```diff
@@ -2,7 +2,6 @@ import datetime
 import dateutil.parser
 import zipfile
 import re
 import dateparser as date_parser
 from gargantext.util.languages import languages
@@ -23,6 +22,13 @@ class Parser:
     def __del__(self):
         self._file.close()
 
+    def detect_format(self, afile, a_formats):
+        #import magic
+        print("Detecting format")
+        #print(magic.from_file(afile))
+        return
+
     def detect_encoding(self, string):
         """Useful method to detect the encoding of a document.
         """
@@ -107,10 +113,10 @@ class Parser:
         hyperdata[prefix + "_hour"]   = date.strftime("%H")
         hyperdata[prefix + "_minute"] = date.strftime("%M")
         hyperdata[prefix + "_second"] = date.strftime("%S")
+        print(hyperdata['publication_date'])
         # finally, return the transformed result!
         return hyperdata
-        print(hyperdata['publication_date'])
 
     def format_hyperdata_languages(self, hyperdata):
         """format the languages found in the hyperdata."""
```
gargantext/util/parsers/__init__.py

```diff
@@ -9,4 +9,4 @@ from .Europress import EuropressParser
 from .ISTex import ISTexParser
 from .CSV import CSVParser
-#from .CERN import CernParser
+from .Cern import CernParser
```
gargantext/util/taggers/__init__.py

```diff
-from .TurboTagger import TurboTagger
+# from .TurboTagger import TurboTagger
 from .NltkTagger import NltkTagger
-from .TreeTagger import TreeTagger
+# from .TreeTagger import TreeTagger
-from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
+# from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
```
gargantext/views/pages/projects.py

```diff
@@ -2,11 +2,12 @@ from gargantext.util.http import *
 from gargantext.util.db import *
 from gargantext.util.db_cache import cache
 from gargantext.util.files import upload
+from gargantext.util.files import check_format
 from gargantext.models import *
 from gargantext.constants import *
 from gargantext.util.scheduling import scheduled
 from gargantext.util.toolchain import parse_extract_indexhyperdata
+from gargantext.util.toolchain import add_corpus
 from datetime import datetime
 from collections import defaultdict
@@ -17,7 +18,7 @@ import re
 @requires_auth
 def overview(request):
     '''This view show all projects for a given user.
-    Each project is described with hyperdata that are updated ed on each following view.
+    Each project is described with hyperdata that are updated on each following view.
     To each project, we can link a resource that can be an image.
     '''
@@ -59,17 +60,25 @@ def overview(request):
 class NewCorpusForm(forms.Form):
+    '''OK: add corpus Form (NIY)'''
     type = forms.ChoiceField(
         choices = enumerate(resource_type['name'] for resource_type in RESOURCETYPES),
         widget  = forms.Select(attrs={'onchange': 'CustomForSelect( $("option:selected", this).text() );'})
     )
     name = forms.CharField(label='Name', max_length=199, widget=forms.TextInput(attrs={'required': 'true'}))
     file = forms.FileField()
     def clean_file(self):
         file_ = self.cleaned_data.get('file')
-        if len(file_) > 1024 ** 3:  # we don't accept more than 1GB
+        if len(file_) > UPLOAD_LIMIT:  # we don't accept more than 1GB
             raise forms.ValidationError(ugettext_lazy('File too heavy! (>1GB).'))
         return file_
+    def check_filename(self):
+        print(self.cleaned_data)
+        print(self.cleaned_data.get("file").split(".")[-1])
+        #if self.cleaned_data.get("file").split(".")[-1] not in RESSOURCETYPES[choices]
+        #print RESOURCETYPES[self.cleaned_data.get("
+        pass
 
 @requires_auth
@@ -83,61 +92,55 @@ def project(request, project_id):
     if not user.owns(project):
         raise HttpResponseForbidden()
 
-    # new corpus
+    # add a new corpus into Node Project > Node Corpus > Ressource
     if request.method == 'POST':
-        corpus = project.add_child(
-            name = request.POST['name'],
-            typename = 'CORPUS',
-        )
-        corpus.add_resource(
-            type = int(request.POST['type']),
-            path = upload(request.FILES['file']),
-        )
-        session.add(corpus)
-        session.commit()
-        # parse_extract: fileparsing -> ngram extraction -> lists
-        scheduled(parse_extract_indexhyperdata)(corpus.id)
-        return render(
-            template_name = 'pages/projects/wait.html',
-            request = request,
-            context = {
-                'user': request.user,
-                'project': project,
-            },
-        )
+        corpus = add_corpus(request, project)
+        if corpus.status:
+            # parse_extract: fileparsing -> ngram extraction -> lists
+            scheduled(parse_extract_indexhyperdata)(corpus.id)
+            return render(
+                template_name = 'pages/projects/wait.html',
+                request = request,
+                context = {
+                    'user': request.user,
+                    'project': project,
+                },
+            )
 
-    # corpora within this project
+    # list all the corpora within this project
     corpora = project.children('CORPUS', order=True).all()
+    #print(corpora)
     sourcename2corpora = defaultdict(list)
     for corpus in corpora:
         # we only consider the first resource of the corpus to determine its type
         resources = corpus.resources()
-        if len(resources):
+        if len(resources) > 0:
             resource = resources[0]
-            resource_type_name = RESOURCETYPES[resource['type']]['name']
+            resource = get_resource(resource["type"])
+        else:
+            ##here map from RESSOURCES_TYPES_ID and NOT NAME
+            print("(WARNING) PROJECT view: no listed resource")
+        resource_type_name = resource['name']
+        resource_type_accepted_formats = resource['accepted_formats']
         # add some data for the viewer
         corpus.count = corpus.children('DOCUMENT').count()
         status = corpus.status()
         if status is not None and not status['complete']:
             if not status['error']:
                 corpus.status_message = '(in progress: %s, %d complete)' % (
                     status['action'].replace('_', ' '),
                     status['progress'],
                 )
             else:
                 corpus.status_message = '(aborted: "%s" after %i docs)' % (
                     status['error'][-1],
                     status['progress']
                 )
         else:
             corpus.status_message = ''
         # add
         sourcename2corpora[resource_type_name].append(corpus)
 
     # source & their respective counts
     total_documentscount = 0
     sourcename2documentscount = defaultdict(int)
```
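The corpus status messages in the project view are plain %-formatting over the status dict. The branching in isolation (the sample status dicts are made up for the example):

```python
def status_message(status):
    # mirror the branching in the project view
    if status is not None and not status['complete']:
        if not status['error']:
            return '(in progress: %s, %d complete)' % (
                status['action'].replace('_', ' '), status['progress'])
        return '(aborted: "%s" after %i docs)' % (
            status['error'][-1], status['progress'])
    return ''

print(status_message({'complete': False, 'error': [],
                      'action': 'parse_extract', 'progress': 40}))
# (in progress: parse extract, 40 complete)
print(status_message({'complete': False, 'error': ['ValueError'],
                      'action': 'parse_extract', 'progress': 12}))
# (aborted: "ValueError" after 12 docs)
```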
gargantext/views/pages/terms.py

```diff
@@ -2,7 +2,7 @@ from gargantext.util.http import requires_auth, render, settings
 from gargantext.util.db import session
 from gargantext.util.db_cache import cache
 from gargantext.models import Node
-from gargantext.constants import resourcename
+from gargantext.constants import get_resource
 from datetime import datetime
 
 @requires_auth
@@ -32,7 +32,7 @@ def ngramtable(request, project_id, corpus_id):
             'date': datetime.now(),
             'project': project,
             'corpus': corpus,
-            'resourcename': resourcename(corpus),
+            'resourcename': get_ressource(corpus)["name"],
             'view': 'terms'
         },
     )
```
install/python/requirements.txt

```diff
@@ -29,3 +29,5 @@ networkx==1.11
 pandas==0.18.0
 six==1.10.0
 lxml==3.5.0
+bs4==0.0.1
+requests==2.10.0
```
install/run.sh → install/run_docker.md

```diff
@@ -8,6 +8,4 @@ su gargantua
 #activate the virtualenv
 source /srv/env_3-5/bin/activate
 #go to gargantext srv
-cd /srv/gargantext/
-#run the server
-/manage.py runserver 0.0.0.0:8000
+cd /srv/gargantext/manage.py runserver 0.0.0.0:8000
```
moissonneurs/cern.py 0 → 100644

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** CERN Scraper *****
# ****************************
import logging
from logging.handlers import RotatingFileHandler

# create the logger object used to write the logs
logger = logging.getLogger()
# set the logger level to DEBUG so that it records everything
logger.setLevel(logging.DEBUG)
# create a formatter that prepends the timestamp and level
# to every message written to the log
formatter = logging.Formatter('%(asctime)s :: %(levelname)s :: %(message)s')
# a handler that writes the log to a file in 'append' mode,
# with 1 backup and a max size of 1 MB
# >>> Permission denied: conflicts with the django logs
#file_handler = RotatingFileHandler('.activity.log', 'a', 1000000, 1)
# set its level to DEBUG, attach the formatter created above,
# and add this handler to the logger
#~ file_handler.setLevel(logging.DEBUG)
#~ file_handler.setFormatter(formatter)
#~ logger.addHandler(file_handler)
# a second handler that writes every log line to the console
steam_handler = logging.StreamHandler()
steam_handler.setLevel(logging.DEBUG)
logger.addHandler(steam_handler)

import json
import datetime
from os import path
import threading
import hmac, hashlib
import requests
import subprocess
import urllib.parse as uparse
from lxml import etree
from bs4 import BeautifulSoup, Comment
from collections import defaultdict

from django.http import Http404, HttpResponseForbidden

from gargantext.constants import resourcetype, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
#from gargantext.util.files import download
from gargantext.settings import API_TOKENS as API
#from private import API_PERMISSIONS


def save(request, project_id):
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?
    project = session.query(Node).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()
    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()
    if request.method == "POST":
        query = request.POST["query"]
        name = request.POST["string"]
        corpus = project.add_child(name=name, typename="CORPUS")
        # NB: `filename` is not defined at this point -- the download
        # step that should produce it remains to be written
        corpus.add_resource(type=resourcetype('Cern (MARC21 XML)'),
                            path=filename,
                            url=None)
        print("Adding the resource")


def query(request):
    print(request.method)
    alist = []
    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        if N > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
            print("ERROR(scrap: cern stats): ", msg)
            raise ValueError(msg)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
        # requests to the API go here
        #API_TOKEN = API["CERN"]
        #instancia = Scraper()
        # serialFetcher (n_last_years, query, query_size)
        #alist = instancia.serialFetcher( 5, query , N )
    data = alist
    return JsonHttpResponse(data)
class CERN_API(object):
    '''CERN SCOAP3 interaction'''
    def __init__(self, query, filename="./results.xml"):
        self.query = query
        self.apikey = API["TOKEN"]
        self.secret = API["SECRET"].encode("utf-8")
        # BASE_URL must be set before get_results(), which builds the request URL
        self.BASE_URL = u"http://api.scoap3.org/search?"
        self.results = self.get_results(filename)

    def __generate_signature__(self, url):
        '''generate the signature: hmac-sha1 of the url, salted with the secret'''
        return hmac.new(self.secret, url, hashlib.sha1).hexdigest()

    def __format_url__(self):
        '''format the url with the encoded query'''
        dict_q = uparse.parse_qs(self.query)
        # add the apikey
        dict_q["apikey"] = [self.apikey]
        params = "&".join([str(k) + "=" + str(uparse.quote(v[0]))
                           for k, v in sorted(dict_q.items())])
        return self.BASE_URL + params

    def sign_url(self):
        '''append the signature to the formatted url'''
        url = self.__format_url__()
        return url + "&signature=" + self.__generate_signature__(url.encode("utf-8"))

    def get_results(self, filename):
        url = self.sign_url()
        r = requests.get(url, stream=True)
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
        return filename
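The signing scheme used above (parameters rebuilt in sorted order, `apikey` appended, HMAC-SHA1 over the unsigned URL) can be sketched standalone. The key and secret below are placeholders, not real SCOAP3 credentials:

```python
import hmac
import hashlib
import urllib.parse as uparse

BASE_URL = "http://api.scoap3.org/search?"
APIKEY = "demo-key"        # placeholder, not a real credential
SECRET = b"demo-secret"    # placeholder, not a real credential

def sign(query):
    # parse the query string, append the api key, then rebuild the
    # parameters in sorted order so the signature is reproducible
    dict_q = uparse.parse_qs(query)
    dict_q["apikey"] = [APIKEY]
    params = "&".join(str(k) + "=" + uparse.quote(v[0])
                      for k, v in sorted(dict_q.items()))
    url = BASE_URL + params
    # hmac-sha1 over the full unsigned url, hex-encoded
    signature = hmac.new(SECRET, url.encode("utf-8"), hashlib.sha1).hexdigest()
    return url + "&signature=" + signature
```

Sorting the parameters before signing matters: both sides must serialize the URL identically, or the server-side signature check fails.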
    def parse_xml(self, filename):
        '''parse the downloaded MARCXML into a list of record dicts;
        assumes self.MARC21 maps MARC tags to {subfield code: field name}'''
        with open(filename, 'r') as f:
            data = f.read()
        records = []
        for record in data.split("<record>")[1:]:
            soup = BeautifulSoup("<record>" + record, "lxml")
            r = {v: [] for v in self.MARC21["700"].values()}
            r["uid"] = soup.find("controlfield").text
            for datafield in soup.find_all("datafield"):
                tag = datafield.get("tag")
                if tag in self.MARC21.keys():
                    for sub in datafield.find_all("subfield"):
                        code = sub.get("code")
                        if code in self.MARC21[tag].keys():
                            if tag == "700":
                                # repeatable field (authors): accumulate
                                r[self.MARC21[tag][code]].append(sub.text)
                            else:
                                r[self.MARC21[tag][code]] = sub.text
            records.append(r)
        return JsonHttpResponse(records)


#query="of=xm"
#a = CERN_API(query, "./full.xml")
#p = CERNParser("./full.xml")
#print(p.MARC21.keys())
#~ #p.parse()
#~ with open("./results_full.json", "r") as f:
#~     data = json.load(f)
#~     for record in data["records"]:
#~         print(record.keys())
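As a rough standalone check of the same tag/subfield walk, here is the equivalent logic over a tiny hand-made MARCXML snippet, using the stdlib ElementTree instead of BeautifulSoup. The mapping table and the sample record are made up for illustration, not real SCOAP3 output:

```python
import xml.etree.ElementTree as ET

# hypothetical sample record, not real SCOAP3 output
MARCXML = """<record>
  <controlfield tag="001">12345</controlfield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Doe, Jane</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Smith, John</subfield>
  </datafield>
</record>"""

# stand-in for the MARC21 lookup table the parser above expects:
# MARC tag -> {subfield code: field name}
FIELDS = {"700": {"a": "authors"}}

def parse_record(xml_text):
    root = ET.fromstring(xml_text)
    rec = {"authors": []}
    rec["uid"] = root.find("controlfield").text
    for datafield in root.iter("datafield"):
        tag = datafield.get("tag")
        if tag in FIELDS:
            for sub in datafield.iter("subfield"):
                code = sub.get("code")
                if code in FIELDS[tag]:
                    if tag == "700":  # repeatable field: accumulate
                        rec[FIELDS[tag][code]].append(sub.text)
                    else:
                        rec[FIELDS[tag][code]] = sub.text
    return rec
```

The same structure (one dict per `<record>`, list-valued entries for repeatable tags such as 700) is what the CERN parser produces.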
moissonneurs/istex.py
@@ -8,7 +8,7 @@ from traceback import print_tb
 from django.shortcuts import redirect, render
 from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
-from gargantext.constants import resourcetype, QUERY_SIZE_N_MAX
+from gargantext.constants import QUERY_SIZE_N_MAX
 from gargantext.models.nodes import Node
 from gargantext.util.db import session
 from gargantext.util.http import JsonHttpResponse
@@ -133,7 +133,7 @@ def save(request , project_id):
     if filename != False:
         # add the uploaded resource to the corpus
         corpus.add_resource(
-            type = resourcetype('ISTex'),
+            type = 9,
             path = filename
         )
         dwnldsOK += 1
moissonneurs/pubmed.py
@@ -18,7 +18,7 @@ from traceback import print_tb
 from django.shortcuts import redirect
 from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
-from gargantext.constants import resourcetype, QUERY_SIZE_N_MAX
+from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
 from gargantext.models.nodes import Node
 from gargantext.util.db import session
 from gargantext.util.db_cache import cache
@@ -134,7 +134,7 @@ def save( request , project_id ) :
     print(filename)
     if filename != False:
         # add the uploaded resource to the corpus
-        corpus.add_resource( type = resourcetype('Pubmed (XML format)')
+        corpus.add_resource( type = 4
                            , path = filename
                            , url = None
                            )
moissonneurs/urls.py
@@ -13,7 +13,7 @@
 # Available databases :
 ## Pubmed
 ## IsTex,
-## TODO CERN
+## In progress: CERN
 from django.conf.urls import url
@@ -22,7 +22,7 @@ import moissonneurs.pubmed as pubmed
 import moissonneurs.istex as istex
 # TODO
-# import moissonneurs.cern as cern
+import moissonneurs.cern as cern
 # TODO
 #import moissonneurs.hal as hal
@@ -40,6 +40,6 @@ urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
                , url(r'^istex/save/(\d+)' , istex.save )
-               # , url(r'^scoap3/query$' , cern.query )
-               # , url(r'^scoap3/save/(\d+)' , cern.save )
+               , url(r'^scoap3/query$' , cern.query )
+               , url(r'^scoap3/save/(\d+)' , cern.save )
               ]
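The `(\d+)` capture group in the routes enabled above is what Django hands to the view as its positional argument (`project_id` in `cern.save`). The pattern itself can be checked with the stdlib `re` module against a hypothetical path:

```python
import re

# the capture group in the route becomes the view's positional argument
pattern = r'^scoap3/save/(\d+)'
match = re.match(pattern, 'scoap3/save/42')
project_id = match.group(1)  # what Django would pass to cern.save
```

Note the view still converts this string to `int` itself and raises `Http404` on failure, since `\d+` only guarantees digits, not a usable id.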