[MERGE]

72ae16a8 · Alexandre Delanoë · 4dbe577c · fe23f25f · 72ae16a8 · 72ae16a8
Commit 72ae16a8 authored Sep 12, 2017 by Alexandre Delanoë
5 changed files
--- a/gargantext/util/crawlers/HAL.py
+++ b/gargantext/util/crawlers/HAL.py
@@ -14,12 +14,12 @@ from gargantext.util.files import save
 class HalCrawler(Crawler):
    ''' HAL API CLIENT'''
    def __init__(self):
        # Main EndPoints
        self.BASE_URL = "https://api.archives-ouvertes.fr"
        self.API_URL  = "search"
        # Final EndPoints
        # TODO : Change endpoint according to the type of database
        self.URL   = self.BASE_URL + "/" + self.API_URL
@@ -38,7 +38,9 @@ class HalCrawler(Crawler):
    def _get(self, query, fromPage=1, count=10, lang=None):
        # Parameters
-        fl = """ en_title_s
+        fl = """ docid
+               , title_s
+               , abstract_s
               , en_title_s
               , en_abstract_s
               , submittedDate_s
@@ -59,7 +61,7 @@ class HalCrawler(Crawler):
             """
               #, authUrl_s
               #, type_s
        wt = "json"
        querystring = { "q"       : query
@@ -68,18 +70,18 @@ class HalCrawler(Crawler):
                      , "fl"      : fl
                      , "wt"      : wt
                      }
        # Specify Headers
        headers = { "cache-control" : "no-cache" }
        # Do Request and get response
        response = requests.request( "GET"
                                   , self.URL
                                   , headers = headers
                                   , params  = querystring
                                   )
        #print(querystring)
        # Validation : 200 if ok, else raise ValueError
        if response.status_code == 200:
@@ -90,27 +92,27 @@ class HalCrawler(Crawler):
            return (json.loads(response.content.decode(charset)))
        else:
            raise ValueError(response.status_code, response.reason)
    def scan_results(self, query):
        '''
        scan_results : Counts the documents matching a query
        Query String -> Int

        Sends the query through self._get (with its default paging
        arguments) and reads "numFound" from the "response" object of
        the reply; a missing "response" or "numFound" key counts as 0.
        The count is stored in self.results_nb and returned.
        '''
        # Reset first, so the counter reads 0 if the request below raises
        self.results_nb = 0
        reply    = self._get(query)
        response = reply.get("response", {})
        self.results_nb = response.get("numFound", 0)
        return self.results_nb
    def download(self, query):
        downloaded = False
        self.status.append("fetching results")
        corpus = []
@@ -124,7 +126,7 @@ class HalCrawler(Crawler):
                                                            )
            print("ERROR (scrap: HAL d/l ): " , msg)
            self.query_max = QUERY_SIZE_N_MAX
        #for page in range(1, trunc(self.query_max / 100) + 2):
        for page in range(0, self.query_max, paging):
            print("Downloading page %s to %s results" % (page, paging))
@@ -141,5 +143,5 @@ class HalCrawler(Crawler):
                        , basedir=UPLOAD_DIRECTORY
                        )
        downloaded = True
        return downloaded
--- a/gargantext/util/ngramlists_tools.py
+++ b/gargantext/util/ngramlists_tools.py
@@ -94,19 +94,30 @@ def query_list(list_id,
    else:
        # NB: score can be undefined (eg ex-subform that now became free)
        #     ==> we need outerjoin
+        #     and the filter needs to have scoring_metric_id so we do it before
-        NNN = NodeNodeNgram
+        ScoresTable = (session
+                        .query(NodeNodeNgram.score, NodeNodeNgram.ngram_id)
+                        .filter(NodeNodeNgram.node1_id == scoring_metric_id)
+                        .subquery()
+                        )
        query = (session
-                    .query(Ngram.id, Ngram.terms, NNN.score)
+                    .query(
-                    # Ngrams must be related to our list <Node(id=list_id)>
+                        NodeNgram.ngram_id,
-                    .join(NodeNgram, (NodeNgram.ngram_id == Ngram.id) &
+                        Ngram.terms,
-                                     (NodeNgram.node_id == list_id))
+                        ScoresTable.c.score
-                    # Select by metric <Node(id=scoring_metric_id)>
+                     )
-                    .outerjoin(NNN, (NNN.ngram_id == Ngram.id) &
+                    .join(Ngram, NodeNgram.ngram_id == Ngram.id)
-                                    (NNN.node1_id == scoring_metric_id))
-                    # Sort by descending score
+                    # main filter ----------------------
-                    .order_by(NNN.score.desc())
+                    .filter(NodeNgram.node_id == list_id)
+                    # scores if possible
+                    .outerjoin(ScoresTable,
+                               ScoresTable.c.ngram_id == NodeNgram.ngram_id)
+                    .order_by(desc(ScoresTable.c.score))
                )
    if pagination_limit:

--- a/gargantext/util/parsers/HAL.py
+++ b/gargantext/util/parsers/HAL.py
@@ -12,14 +12,12 @@ import json
 class HalParser(Parser):
    def _parse(self, json_docs):
        hyperdata_list = []
-        hyperdata_path = { "id"              : "isbn_s"
+        hyperdata_path = { "id"              : "docid"
-                         , "title"           : "title_s"
+                         , "title"           : ["en_title_s", "title_s"]
-                         , "abstract"        : "abstract_s"
+                         , "abstract"        : ["en_abstract_s", "abstract_s"]
-                         , "title"           : "en_title_s"
-                         , "abstract"        : "en_abstract_s"
                         , "source"          : "journalTitle_s"
                         , "url"             : "uri_s"
                         , "authors"         : "authFullName_s"
@@ -31,8 +29,8 @@ class HalParser(Parser):
                         , "instStructId_i"  : "instStructId_i"
                         , "deptStructId_i"  : "deptStructId_i"
                         , "labStructId_i"   : "labStructId_i"
-                         , "rteamStructId_i" : "rteamStructId_i" 
+                         , "rteamStructId_i" : "rteamStructId_i"
-                         , "docType_s"       : "docType_s" 
+                         , "docType_s"       : "docType_s"
                         }
        uris = set()
@@ -40,29 +38,32 @@ class HalParser(Parser):
        for doc in json_docs:
            hyperdata = {}
            for key, path in hyperdata_path.items():
-                    field = doc.get(path, "NOT FOUND")
+                # A path can be a field name or a sequence of field names
-                    if isinstance(field, list):
+                if isinstance(path, (list, tuple)):
-                        hyperdata[key] = ", ".join(map(lambda x: str(x), field))
+                    # Get first non-empty value of fields in path sequence, or None
-                    else:
+                    field = next((x for x in (doc.get(p) for p in path) if x), None)
-                        hyperdata[key] = str(field)
+                else:
+                    # Get field value
+                    field = doc.get(path)
+                if field is None:
+                    field = "NOT FOUND"
+                if isinstance(field, list):
+                    hyperdata[key] = ", ".join(map(str, field))
+                else:
+                    hyperdata[key] = str(field)
            if hyperdata["url"] in uris:
                print("Document already parsed")
            else:
                uris.add(hyperdata["url"])
-#            hyperdata["authors"] = ", ".join(
-#                                             [ p.get("person", {})
-#                                                .get("name"  , "")
-#                          
-#                                               for p in doc.get("hasauthor", [])
-#                                             ]
-#                                            )
-#            
-                maybeDate = doc.get("submittedDate_s", None)
+                maybeDate = doc.get("submittedDate_s", None)
                if maybeDate is not None:
                    date = datetime.strptime(maybeDate, "%Y-%m-%d %H:%M:%S")
                else:
@@ -72,9 +73,9 @@ class HalParser(Parser):
                hyperdata["publication_year"]  = str(date.year)
                hyperdata["publication_month"] = str(date.month)
                hyperdata["publication_day"]   = str(date.day)
                hyperdata_list.append(hyperdata)
        return hyperdata_list
    def parse(self, filebuf):

--- a/install/notebook/gargantext_notebook.py
+++ b/install/notebook/gargantext_notebook.py
@@ -15,12 +15,16 @@ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
 django.setup()
 from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
-from gargantext.models import ProjectNode, DocumentNode, UserNode, User
+from gargantext.models import ProjectNode, DocumentNode
 from gargantext.util.db import session, get_engine
 from collections import Counter
 import importlib
 from django.http import Http404
+# Import these so they are available to the notebook user
+from langdetect import detect as detect_lang
+from gargantext.models import UserNode, User
 class NotebookError(Exception):
    '''
    Exception type declared for this notebook helper module.

    It adds nothing to Exception; the places that raise it are not
    visible in this excerpt.
    '''

--- a/templates/pages/projects/overview.html
+++ b/templates/pages/projects/overview.html
@@ -203,6 +203,7 @@
      // do something…
        resetStatusForm("#createForm");
      })
+      return false;
    })