Commit 5de00550 authored by delanoe

[FEAT] HAL Parser almost ok (some duplicates, check pages).

parent a85e4c98
@@ -181,8 +181,6 @@ def get_tagger(lang):
    return tagger()

RESOURCETYPES = [
    { "type": 1,
      'name': 'Europresse',
@@ -264,6 +262,14 @@ RESOURCETYPES = [
      "crawler": "MultivacCrawler",
    },
{ "type": 11,
"name": 'HAL [CRAWLER]',
"parser": "HalParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "HalCrawler",
},
]

#shortcut for resources declaration in template
PARSERS = [(n["type"], n["name"]) for n in RESOURCETYPES if n["parser"] is not None]
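For orientation, a hedged sketch of how such a declaration is resolved at runtime; get_resource and load_crawler are the helpers used in moissonneurs/hal.py further down, and the call shapes are inferred from that file.

# Sketch: resolving the HAL entry declared above (calls as used in moissonneurs/hal.py)
from gargantext.constants import get_resource, load_crawler

source = get_resource(11)              # -> the {"type": 11, ...} dict above
crawler_bot = load_crawler(source)()   # -> instantiates HalCrawler by its "crawler" name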
......
@@ -28,19 +28,20 @@ import graph.urls
import moissonneurs.urls

urlpatterns = [ url(r'^admin/' , admin.site.urls )
              , url(r'^api/' , include( gargantext.views.api.urls ) )
              , url(r'^' , include( gargantext.views.pages.urls ) )
              , url(r'^favicon.ico$', Redirect.as_view( url=static.url('favicon.ico')
                                                      , permanent=False), name="favicon" )

              # Module Graph
              , url(r'^' , include( graph.urls ) )

              # Module Annotation
              # tempo: unchanged doc-annotations routes --
              , url(r'^annotations/', include( annotations_urls ) )
              , url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/(focus=[0-9,]+)?$'
                  , annotations_main_view)

              # Module Scrapers (Moissonneurs in French)
              , url(r'^moissonneurs/' , include( moissonneurs.urls ) )
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ****   HAL Scraper      ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG

from ._Crawler import *
import json
import requests

from gargantext.constants import UPLOAD_DIRECTORY, QUERY_SIZE_N_MAX
from math import trunc
from gargantext.util.files import save
class HalCrawler(Crawler):
    ''' HAL API CLIENT'''

    def __init__(self):
        # Main EndPoints
        self.BASE_URL = "https://api.archives-ouvertes.fr"
        self.API_URL  = "search"

        # Final EndPoints
        # TODO : Change endpoint according to the type of database
        self.URL = self.BASE_URL + "/" + self.API_URL
        self.status = []

    def __format_query__(self, query=None):
        '''formatting the query'''
        #search_field="title_t"
        search_field = "abstract_t"
        return (search_field + ":" + "(" + query + ")")
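    # Example (hedged): __format_query__("climate") yields 'abstract_t:(climate)',
    # i.e. a Solr field query restricted to HAL's abstract field.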
    def _get(self, query, fromPage=0, count=10, lang=None):
        # Parameters
        fl = """ title_s
               , abstract_s
               , submittedDate_s
               , journalDate_s
               , authFullName_s
               , uri_s
               , isbn_s
               , issue_s
               , journalPublisher_s
             """
             #, authUrl_s
             #, type_s

        wt = "json"

        # NB: HAL's "start" parameter is a record offset, not a page number
        # (passing page numbers here is what produced the duplicates noted
        # in the commit message).
        querystring = { "q"     : query
                      , "rows"  : count
                      , "start" : fromPage
                      , "fl"    : fl
                      , "wt"    : wt
                      }

        # Specify Headers
        headers = { "cache-control" : "no-cache" }

        # Do Request and get response
        response = requests.request( "GET"
                                   , self.URL
                                   , headers = headers
                                   , params  = querystring
                                   )
        #print(querystring)

        # Validation : 200 if ok else raise Value
        if response.status_code == 200:
            charset = ( response.headers["Content-Type"]
                      .split("; ")[1]
                      .split("=" )[1]
                      )
            return (json.loads(response.content.decode(charset)))
        else:
            raise ValueError(response.status_code, response.reason)
    def scan_results(self, query):
        '''
        scan_results : Returns the number of results
        Query String -> Int
        '''
        self.results_nb = 0

        total = ( self._get(query)
                .get("response", {})
                .get("numFound" , 0)
                )

        self.results_nb = total
        return self.results_nb
    def download(self, query):
        downloaded = False

        self.status.append("fetching results")
        corpus = []
        paging = 100
        self.query_max = self.scan_results(query)
        #print("self.query_max : %s" % self.query_max)

        if self.query_max > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
                                                            , QUERY_SIZE_N_MAX
                                                            )
            print("ERROR (scrap: HAL d/l ): " , msg)
            self.query_max = QUERY_SIZE_N_MAX

        # Walk the result set by record offset, `paging` records at a time
        # (see the offset note in _get above).
        for offset in range(0, self.query_max, paging):
            print("Downloading results %i to %i" % (offset, offset + paging))

            docs = (self._get(query, fromPage=offset, count=paging)
                   .get("response", {})
                   .get("docs"    , [])
                   )

            for doc in docs:
                corpus.append(doc)

        self.path = save( json.dumps(corpus).encode("utf-8")
                        , name='HAL.json'
                        , basedir=UPLOAD_DIRECTORY
                        )
        downloaded = True

        return downloaded
@@ -8,9 +8,9 @@
from ._Crawler import *
import json

from gargantext.settings import API_TOKENS
from gargantext.constants import UPLOAD_DIRECTORY
from math import trunc
from gargantext.util.files import save

class MultivacCrawler(Crawler):
@@ -30,14 +30,7 @@ class MultivacCrawler(Crawler):
    def __format_query__(self, query=None):
        '''formatting the query'''
        if query is not None:
            self.query = query
            return self.query
        else:
            self.query = ""
            return self.query
    def _get(self, query, fromPage=1, count=10, lang=None):
        # Parameters
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** HAL Parser ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser
from datetime import datetime
import json
class HalParser(Parser):

    def parse(self, filebuf):
        '''
        parse :: FileBuff -> [Hyperdata]
        '''
        contents = filebuf.read().decode("UTF-8")
        data = json.loads(contents)
        filebuf.close()

        json_docs = data
        hyperdata_list = []

        hyperdata_path = { "id"       : "isbn_s"
                         , "title"    : "title_s"
                         , "abstract" : "abstract_s"
                         , "source"   : "journalPublisher_s"
                         , "url"      : "uri_s"
                         , "authors"  : "authFullName_s"
                         }

        for doc in json_docs:
            hyperdata = {}

            for key, path in hyperdata_path.items():
                field = doc.get(path, "NOT FOUND")
                if isinstance(field, list):
                    hyperdata[key] = ", ".join(field)
                else:
                    hyperdata[key] = field

            # hyperdata["authors"] = ", ".join(
            #     [ p.get("person", {})
            #        .get("name"  , "")
            #       for p in doc.get("hasauthor", [])
            #     ]
            # )

            maybeDate = doc.get("submittedDate_s", None)
            if maybeDate is not None:
                date = datetime.strptime(maybeDate, "%Y-%m-%d %H:%M:%S")
            else:
                date = datetime.now()

            hyperdata["publication_date"]  = date
            hyperdata["publication_year"]  = str(date.year)
            hyperdata["publication_month"] = str(date.month)
            hyperdata["publication_day"]   = str(date.day)

            hyperdata_list.append(hyperdata)

        return hyperdata_list
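A minimal sketch of the mapping above, run on an invented one-record HAL payload (field names follow hyperdata_path; the values are made up):

# Hedged sketch: one invented HAL record fed through HalParser.parse.
import io, json
sample = [ { "title_s"         : ["A title"]
           , "abstract_s"      : ["An abstract"]
           , "authFullName_s"  : ["A. Author", "B. Author"]
           , "uri_s"           : "https://hal.example/hal-00000000"
           , "submittedDate_s" : "2017-03-09 10:00:00"
           } ]
docs = HalParser().parse(io.BytesIO(json.dumps(sample).encode("UTF-8")))
assert docs[0]["authors"] == "A. Author, B. Author"   # lists are comma-joined
assert docs[0]["source"]  == "NOT FOUND"              # default for missing fields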
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** MULTIVAC Parser ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser
from datetime import datetime
import json
@@ -16,14 +24,11 @@ class MultivacParser(Parser):
        json_docs = data
        hyperdata_list = []
        hyperdata_path = { "id"       : "id"
                         , "title"    : "title"
                         , "abstract" : "abstract"
                         , "type"     : "type"
                         }
        for json_doc in json_docs:
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# *****   HAL Crawler   *****
# ****************************
# LICENCE: GARGANTEXT.org Licence

from traceback import print_tb

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect \
                      , HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata

RESOURCE_TYPE_HAL = 11
def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query  = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_HAL)

        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()

            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)

            print(results)
            return JsonHttpResponse({"results_nb": crawlerbot.results_nb})
def save(request, project_id):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except (TypeError, ValueError):
            N = 0
        print(query, N)

        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_HAL)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()

        # do we have a valid project?
        project = session.query( Node ).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()

        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instantiation as a Django model
        corpus = Node(
            name      = query,
            user_id   = request.user.id,
            parent_id = project_id,
            typename  = 'CORPUS',
            hyperdata = { "action" : "Scraping data"
                        }
        )

        #download_file
        crawler_bot = load_crawler(source)()

        #for now no way to force downloading X records
        #the long running command
        downloaded = crawler_bot.download(query)

        corpus.add_resource(
              type = source["type"]
            #, name = source["name"]
            , path = crawler_bot.path
            )

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        return render(
            template_name = 'pages/projects/wait.html',
            request = request,
            context = {
                'user'   : request.user,
                'project': project,
            },
        )

    # Fallback for non-POST requests
    raise Http404()
@@ -22,25 +22,27 @@ import moissonneurs.pubmed as pubmed
import moissonneurs.istex as istex
import moissonneurs.cern as cern
import moissonneurs.multivac as multivac
import moissonneurs.hal as hal

# TODO : ISIDORE

# REST API for the moissonneurs
# /!\ urls patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$'       , pubmed.query   )
              , url(r'^pubmed/save/(\d+)'   , pubmed.save    )
              , url(r'^istex/query$'        , istex.query    )
              , url(r'^istex/save/(\d+)'    , istex.save     )
              , url(r'^cern/query$'         , cern.query     )
              , url(r'^cern/save/(\d+)'     , cern.save      )
              , url(r'^multivac/query$'     , multivac.query )
              , url(r'^multivac/save/(\d+)' , multivac.save  )
              , url(r'^hal/query$'          , hal.query      )
              , url(r'^hal/save/(\d+)'      , hal.save       )

              #, url(r'^isidore/query$'     , isidore.query  )
              #, url(r'^isidore/save/(\d+)' , isidore.save   )
              ]
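For reference, a short client-side summary of the two new HAL routes (payloads and responses as read from moissonneurs/hal.py above):

# POST /moissonneurs/hal/query              data: {"query": ...}
#   -> JSON {"results_nb": <int>}           (hal.query)
# POST /moissonneurs/hal/save/<project_id>  data: {"query": ..., "N": <int>}
#   -> downloads up to N records, attaches them to a new CORPUS node,
#      then returns the "wait" page (hal.save)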
@@ -325,11 +325,13 @@
<h2 class="modal-title"><span class="glyphicon glyphicon-info-sign" aria-hidden="true"></span> Building the corpus...</h2>
</div>

<div class="modal-body">
<center>
<p>
Gargantext is gathering your texts <br>
and needs some time to eat them. <br>
Duration depends on the size of the dish.
</p>
</center>
</div>

<div class="modal-footer">
<button type="button" class="btn btn-secondary" data-dismiss="modal">Continue on Gargantext</button>
@@ -440,9 +442,9 @@
var type = $("#id_type").val()

// 5 booleans
var nameField = $("#id_name").val() != ""
var typeField = (type != "") && (type != "0")
var fileField = $("#id_file").val() != ""
var wantfileField = $("#file_yes").prop("checked")
var crawling = ((type==3)||(type==8)||(type==9)) && ! wantfileField
@@ -705,6 +707,67 @@
}
//HAL = 11
if (SourceTypeId == "11"){
    $.ajax({
        // contentType: "application/json",
        url: window.location.origin+"/moissonneurs/hal/query",
        data: formData,
        type: 'POST',
        beforeSend: function(xhr) {
            xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
        },
        success: function(data) {
            console.log(data)
            console.log("SUCCESS")
            console.log("enabling "+"#"+value.id)
            // $("#"+value.id).attr('onclick','getGlobalResults(this);');
            $("#submit_thing").prop('disabled' , false)
            //$("#submit_thing").html("Process a {{ query_size }} sample!")

            N = data["results_nb"]
            if (N > 0) {
                if (N <= {{query_size}}) {
                    $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
                    $("#submit_thing").html("Download!")
                    $("#submit_thing").prop('disabled' , false)
                    //$("#submit_thing").attr('onclick', testCERN(query, N));
                    $("#submit_thing").on("click", function(){
                        saveALL(pubmedquery, N);
                        //$("#submit_thing").onclick()
                    })
                }
                // (N > {{query_size}})
                else {
                    $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
                    $('#submit_thing').prop('disabled', false);
                    $("#submit_thing").html("Processing a sample file")
                    $("#submit_thing").on("click", function(){
                        saveALL(pubmedquery, N);
                        //$("#submit_thing").onclick()
                    })
                }
            }
            else {
                $("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!</i><br>")
                if (data[0] == false)
                    $("#theresults").html(theType + " connection error!</i><br>")
                $('#submit_thing').prop('disabled', true);
            }
        },
        error: function(result) {
            $("#theresults").html(theType + " connection error</i><br>")
            $('#submit_thing').prop('disabled', true);
        }
    });
}
}

// CSS events for selecting one Radio-Input
@@ -751,7 +814,12 @@
console.log("selected:", selectedId); console.log("selected:", selectedId);
// by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN // by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN
if(selectedId =="3" || selectedId == "8" || selectedId == "9" || selectedId == "10") { if ( selectedId == "3"
|| selectedId == "8"
|| selectedId == "9"
|| selectedId == "10"
|| selectedId == "11"
) {
console.log("show the button for: " + selectedId) console.log("show the button for: " + selectedId)
$("#div-fileornot").css("visibility", "visible"); $("#div-fileornot").css("visibility", "visible");
$("#div-fileornot").show(); $("#div-fileornot").show();
...@@ -933,6 +1001,55 @@ ...@@ -933,6 +1001,55 @@
}); });
} }
function saveALL(query, N){
    console.log("In Gargantext")
    if (!query || query == "") return;
    console.log(query)

    //var origQuery = query
    var data = { "query": query, "N": N };

    // Replace all the slashes
    var projectid = window.location.href.split("projects")[1].replace(/\//g, '')
    console.log(data)

    $.ajax({
        dataType: 'json',
        url: window.location.origin+"/moissonneurs/hal/save/"+projectid,
        data: data,
        type: 'POST',
        beforeSend: function(xhr) {
            xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
        },
        success: function(data) {
            console.log("ajax_success: in Gargantext()")
            console.log(data)
            alert("OK")
            setTimeout(
                function() {
                    $('#addcorpus').modal('hide')
                    $("#wait").modal("show");
                }, 600);
        },
        error: function(data) {
            console.log(data)
            setTimeout(
                function() {
                    $('#addcorpus').modal('hide')
                    $("#wait").modal("show")
                    //setTimeout(, 300)
                    //location.reload();
                }, 600);
        },
    });
}
......