Commit ba042fa0 authored by delanoe

[FEAT] Multivac/REPEC scan is OK; the parser still needs fixing.

parent 5a6c8acd
......@@ -248,13 +248,22 @@ RESOURCETYPES = [
'file_formats':["zip","xml"],
"crawler": "CernCrawler",
},
# { "type": 10,
# "name": 'REPEC [RIS]',
# "parser": "RISParser",
# "format": 'RIS',
# 'file_formats':["zip","ris", "txt"],
# "crawler": None,
# },
#
{ "type": 10,
"name": 'REPEC [RIS]',
"parser": "RISParser",
"format": 'RIS',
'file_formats':["zip","ris", "txt"],
"crawler": None,
"name": 'REPEC [MULTIVAC]',
"parser": "MultivacParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "MultivacCrawler",
},
]
#shortcut for resources declaration in template
PARSERS = [(n["type"],n["name"]) for n in RESOURCETYPES if n["parser"] is not None]
......
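For context: the PARSERS shortcut above keeps only the resources that declare a parser, and get_resource (imported from gargantext.constants in the hunks below) presumably resolves a numeric type to its declaration. A minimal sketch of that lookup, under that assumption:

# Sketch (assumption) of the lookup get_resource performs over the
# RESOURCETYPES table declared above.
def get_resource(resource_type):
    for resource in RESOURCETYPES:
        if resource["type"] == resource_type:
            return resource
    return None

source = get_resource(10)                    # the 'REPEC [MULTIVAC]' entry
has_crawler = source["crawler"] is not None  # True after this commit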
......@@ -4,7 +4,7 @@
# ***** CERN Scraper *****
# ****************************
# Author:c24b
# Date: 27/05/2016
import hmac, hashlib
import requests
import os
......@@ -96,10 +96,12 @@ class CernCrawler(Crawler):
print(self.results_nb, "res")
#self.generate_urls()
return(self.ids)
def generate_urls(self):
''' generate one raw export URL per record id'''
self.urls = ["http://repo.scoap3.org/record/%i/export/xm?ln=en" %rid for rid in self.ids]
return self.urls
def fetch_records(self, ids):
''' for NEXT time'''
raise NotImplementedError
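A quick usage sketch of generate_urls() above; the ids are hypothetical and would normally be collected by the scan step:

# Hypothetical usage (assumes the crawler is already instantiated by the
# calling view); self.ids is normally filled when scanning a query.
crawler.ids = [11178, 11179]
print(crawler.generate_urls())
# -> ['http://repo.scoap3.org/record/11178/export/xm?ln=en',
#     'http://repo.scoap3.org/record/11179/export/xm?ln=en']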
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** MULTIVAC Scraper ****
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Crawler import *
import json
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** MULTIVAC Scraper ****
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Crawler import *
import json
from gargantext.settings import API_TOKENS
class MultivacCrawler(Crawler):
''' Multivac API CLIENT'''
def __init__(self):
self.apikey = API_TOKENS["MULTIVAC"]
# Main EndPoints
self.BASE_URL = "https://api.iscpif.fr/v2"
self.API_URL = "pvt/economy/repec/search"
# Final EndPoints
# TODO: change the endpoint according to the type of database
self.URL = self.BASE_URL + "/" + self.API_URL
def __format_query__(self, query=None):
'''format the query'''
if query is not None:
self.query = query
return self.query
else:
self.query = ""
return self.query
def _get(self, query, fromPage=1, count=10, lang=None):
# Parameters
querystring = { "q" : query
, "count" : count
, "from" : fromPage
, "api_key" : API_TOKENS["MULTIVAC"]["APIKEY"]
}
if lang is not None:
querystring["lang"] = lang
# Specify Headers
headers = { "cache-control" : "no-cache" }
# Do Request and get response
response = requests.request( "GET"
, self.URL
, headers = headers
, params = querystring
)
# Validation: 200 if OK, else raise ValueError
if response.status_code == 200:
charset = response.headers["Content-Type"].split("; ")[1].split("=")[1]
return (json.loads(response.content.decode(charset)))
else:
raise ValueError(response.status_code, response.reason)
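The Content-Type parsing above assumes a header shaped exactly like 'application/json; charset=utf-8'. A more defensive variant (a sketch, not part of this commit) would tolerate a missing charset parameter:

# Defensive charset extraction (sketch): fall back to utf-8 when the
# header carries no explicit "charset=" parameter.
content_type = response.headers.get("Content-Type", "")
charset = "utf-8"
for part in content_type.split(";"):
    part = part.strip()
    if part.lower().startswith("charset="):
        charset = part.split("=", 1)[1]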
def scan_results(self, query):
'''
scan_results : Returns the number of results
Query String -> Int
'''
self.results_nb = 0
total = self._get(query)["results"]["total"]
self.results_nb = total
return self.results_nb
def download(self, query):
downloaded = False
self.status.append("fetching results")
corpus = []
paging = 100
self.query_max = self.results_nb
if self.query_max > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (self.query_max, QUERY_SIZE_N_MAX)
print("ERROR (scrap: multivac d/l ): ",msg)
self.query_max = QUERY_SIZE_N_MAX
with open(self.path, 'wb') as f:
for page in range(0, self.query_max, paging):
corpus.append(self._get(self.query, fromPage=page, count=paging)["hits"])
# NB: str(corpus) writes a Python repr, not strict JSON,
# hence the parser fix still needed per the commit message
f.write(str(corpus).encode("utf-8"))
downloaded = True
return downloaded
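For reference, the intended call order is scan_results() first (it sets results_nb, which download() uses as its upper bound), then download(). A minimal usage sketch, with hypothetical values for the attributes normally provided by the Crawler base class or the calling view:

# Usage sketch; status, path and query are assumptions here, normally
# set up by the Crawler base class / the saving view.
bot = MultivacCrawler()
bot.status = []
bot.path = "/tmp/multivac.json"
bot.query = "financial crisis"
if bot.scan_results(bot.query) > 0:   # sets bot.results_nb
    bot.download(bot.query)           # writes the hits to bot.path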
......
# Scrapers config
QUERY_SIZE_N_MAX = 1000
from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.util.scheduling import scheduled
from gargantext.util.db import session
from requests_futures.sessions import FuturesSession
......@@ -18,31 +18,34 @@ class Crawler:
#the name of corpus
#that will be built in case of internal fileparsing
self.record = record
self.name = record["corpus_name"]
self.project_id = record["project_id"]
self.user_id = record["user_id"]
self.resource = record["source"]
self.type = get_resource(self.resource)
self.query = record["query"]
#format the sampling
self.n_last_years = 5
self.YEAR = date.today().year
# not pretty, but the easy version
self.MONTH = str(date.today().month)
if len(self.MONTH) == 1:
self.MONTH = "0"+self.MONTH
self.MAX_RESULTS = QUERY_SIZE_N_MAX
try:
self.results_nb = int(record["count"])
except KeyError:
# does not exist yet
self.results_nb = 0
try:
self.webEnv = record["webEnv"]
self.webEnv = record["webEnv"]
self.queryKey = record["queryKey"]
self.retMax = record["retMax"]
self.retMax = record["retMax"]
except KeyError:
# does not exist yet
self.queryKey = None
......@@ -67,6 +70,7 @@ class Crawler:
if self.download():
self.create_corpus()
return self.corpus_id
def get_sampling_dates():
'''Create a sample list of min and max dates based on Y and M
for N_LAST_YEARS results'''
......
......@@ -13,20 +13,21 @@ class ISTexParser(Parser):
hyperdata_list = []
hyperdata_path = {
"id" : "id",
"source" : 'corpusName',
"title" : 'title',
"source" : "corpusName",
"title" : "title",
"genre" : "genre",
"language_iso3" : 'language',
"doi" : 'doi',
"host" : 'host',
"publication_date" : 'publicationDate',
"abstract" : 'abstract',
"language_iso3" : "language",
"doi" : "doi",
"host" : "host",
"publication_date" : "publicationDate",
"abstract" : "abstract",
# "authors" : 'author',
"authorsRAW" : 'author',
"authorsRAW" : "author",
#"keywords" : "keywords"
}
suma = 0
for json_doc in json_docs:
hyperdata = {}
......
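The hyperdata_path dict above maps hyperdata field names to keys of the ISTEX JSON documents. The remapping loop is truncated above; a sketch of its assumed shape:

# Assumed shape (sketch) of the remapping that consumes hyperdata_path:
# copy each ISTEX JSON key into the corresponding hyperdata field.
for json_doc in json_docs:
    hyperdata = {}
    for out_field, json_key in hyperdata_path.items():
        if json_key in json_doc:
            hyperdata[out_field] = json_doc[json_key]
    hyperdata_list.append(hyperdata)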
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** MULTIVAC Crawler *****
# ****************************
# LICENCE: GARGANTEXT.org Licence
RESOURCE_TYPE_MULTIVAC = 10
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from traceback import print_tb
def query(request):
'''return the global number of results for a query'''
if request.method == "POST":
query = request.POST["query"]
source = get_resource(RESOURCE_TYPE_MULTIVAC)
if source["crawler"] is not None:
crawlerbot = load_crawler(source)()
#old raw way to get results_nb
results = crawlerbot.scan_results(query)
#ids = crawlerbot.get_ids(query)
print(results)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
'''save the query results as a new corpus'''
if request.method == "POST":
query = request.POST.get("query")
try:
N = int(request.POST.get("N"))
except (TypeError, ValueError):
N = 0
print(query, N)
#for next time
#ids = request.POST["ids"]
source = get_resource(RESOURCE_TYPE_MULTIVAC)
if N == 0:
raise Http404()
if N > QUERY_SIZE_N_MAX:
N = QUERY_SIZE_N_MAX
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = session.query( Node ).filter(Node.id == project_id).first()
if project is None:
raise Http404()
user = cache.User[request.user.id]
if not user.owns(project):
return HttpResponseForbidden()
# corpus node instantiation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
, "language_id" : "en"
}
)
#download_file
crawler_bot = load_crawler(source)()
#for now no way to force downloading X records
#the long-running command: download() returns a boolean,
#the file itself lands at crawler_bot.path
downloaded = crawler_bot.download(query)
corpus.add_resource(
type = source["type"]
#, name = source["name"]
, path = crawler_bot.path
)
session.add(corpus)
session.commit()
#corpus_id = corpus.id
try:
scheduled(parse_extract_indexhyperdata)(corpus.id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user,
'project': project,
},
)
data = [query, N]
print(data)
return JsonHttpResponse(data)
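A hedged way to exercise the new query endpoint by hand; the host and CSRF values are placeholders, and a real call needs an authenticated Django session (the route itself comes from the urls.py hunk below):

# Hypothetical manual check of /moissonneurs/multivac/query.
import requests

resp = requests.post(
    "http://localhost:8000/moissonneurs/multivac/query",  # host is an assumption
    data    = {"query": "financial crisis"},
    headers = {"X-CSRFToken": "<token>"},                 # placeholder
    cookies = {"csrftoken": "<token>"},                   # placeholder
)
print(resp.json())  # -> {"results_nb": <int>}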
......@@ -18,9 +18,10 @@
from django.conf.urls import url
import moissonneurs.pubmed as pubmed
import moissonneurs.istex as istex
import moissonneurs.cern as cern
import moissonneurs.multivac as multivac
# TODO
#import moissonneurs.hal as hal
......@@ -31,11 +32,15 @@ import moissonneurs.cern as cern
# REST API for the moissonneurs
# /!\ urls patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
, url(r'^pubmed/save/(\d+)' , pubmed.save )
, url(r'^istex/query$' , istex.query )
, url(r'^istex/save/(\d+)' , istex.save )
, url(r'^cern/query$' , cern.query )
, url(r'^cern/save/(\d+)' , cern.save )
, url(r'^multivac/query$' , multivac.query )
, url(r'^multivac/save/(\d+)' , multivac.save )
]
......@@ -209,9 +209,11 @@
function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
selected = selected.toLowerCase()
var is_pubmed = (selected.indexOf('pubmed') != -1);
var is_istex = (selected.indexOf('istex' ) != -1);
var is_repec = (selected.indexOf('repec' ) != -1);
if (is_pubmed || is_istex || is_repec) {
// if(selected=="pubmed") {
console.log("show the button for: " + selected)
$("#pubmedcrawl").css("visibility", "visible");
......
......@@ -545,7 +545,7 @@
},
error: function(result) {
$("#theresults").html("Pubmed connection error!</i><br>")
$("#theresults").html("Pubmed connection error.</i><br>")
$('#submit_thing').prop('disabled', true);
}
});
......@@ -643,6 +643,68 @@
});
}
//MULTIVAC = 10
if (SourceTypeId == "10"){
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/moissonneurs/multivac/query",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log("SUCCESS")
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
N = data["results_nb"]
if(N > 0) {
if (N <= {{query_size}}){
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
$("#submit_thing").html("Download!")
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").attr('onclick', testCERN(query, N));
$("#submit_thing").on("click", function(){
testCERN(pubmedquery, N);
//$("#submit_thing").onclick()
})}
//(N > {{query_size}})
else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
$('#submit_thing').prop('disabled', false);
$("#submit_thing").html("Processing a sample file")
$("#submit_thing").on("click", function(){
testCERN(pubmedquery, N);
//$("#submit_thing").onclick()
})}
}
else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>")
if(data[0]==false)
$("#theresults").html(theType +" connection error!</i><br>")
$('#submit_thing').prop('disabled', true);
}
},
error: function(result) {
$("#theresults").html(theType +" connection error!</i><br>")
$('#submit_thing').prop('disabled', true);
}
});
}
}
// CSS events for selecting one Radio-Input
......@@ -689,7 +751,7 @@
console.log("selected:", selectedId);
// by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN, 10 = MULTIVAC
if(selectedId =="3" || selectedId == "8" || selectedId == "9" || selectedId == "10") {
console.log("show the button for: " + selectedId)
$("#div-fileornot").css("visibility", "visible");
$("#div-fileornot").show();
......