[MERGE]

72ae16a8 · Alexandre Delanoë · 4dbe577c · fe23f25f · 72ae16a8 · 72ae16a8
Commit 72ae16a8 authored Sep 12, 2017 by Alexandre Delanoë
5 changed files
--- a/gargantext/util/crawlers/HAL.py
+++ b/gargantext/util/crawlers/HAL.py
@@ -38,7 +38,9 @@ class HalCrawler(Crawler):
    def _get(self, query, fromPage=1, count=10, lang=None):
        # Parameters

-        fl = """ en_title_s
+        fl = """ docid
+               , title_s
+               , abstract_s
               , en_title_s
               , en_abstract_s
               , submittedDate_s

--- a/gargantext/util/ngramlists_tools.py
+++ b/gargantext/util/ngramlists_tools.py
@@ -94,19 +94,30 @@ def query_list(list_id,
    else:
        # NB: score can be undefined (eg ex-subform that now became free)
        #     ==> we need outerjoin
+        #     and the scoring_metric_id filter must be applied before the outerjoin (hence the subquery)

-        NNN = NodeNodeNgram
+        ScoresTable = (session
+                        .query(NodeNodeNgram.score, NodeNodeNgram.ngram_id)
+                        .filter(NodeNodeNgram.node1_id == scoring_metric_id)
+                        .subquery()
+                        )

        query = (session
-                    .query(Ngram.id, Ngram.terms, NNN.score)
-                    # Ngrams must be related to our list <Node(id=list_id)>
-                    .join(NodeNgram, (NodeNgram.ngram_id == Ngram.id) &
-                                     (NodeNgram.node_id == list_id))
-                    # Select by metric <Node(id=scoring_metric_id)>
-                    .outerjoin(NNN, (NNN.ngram_id == Ngram.id) &
-                                    (NNN.node1_id == scoring_metric_id))
-                    # Sort by descending score
-                    .order_by(NNN.score.desc())
+                    .query(
+                        NodeNgram.ngram_id,
+                        Ngram.terms,
+                        ScoresTable.c.score
+                     )
+                    .join(Ngram, NodeNgram.ngram_id == Ngram.id)
+
+                    # main filter ----------------------
+                    .filter(NodeNgram.node_id == list_id)
+
+                    # scores if possible
+                    .outerjoin(ScoresTable,
+                               ScoresTable.c.ngram_id == NodeNgram.ngram_id)
+
+                    .order_by(desc(ScoresTable.c.score))
                )

    if pagination_limit:

--- a/gargantext/util/parsers/HAL.py
+++ b/gargantext/util/parsers/HAL.py
@@ -15,11 +15,9 @@ class HalParser(Parser):

        hyperdata_list = []

-        hyperdata_path = { "id"              : "isbn_s"
-                         , "title"           : "title_s"
-                         , "abstract"        : "abstract_s"
-                         , "title"           : "en_title_s"
-                         , "abstract"        : "en_abstract_s"
+        hyperdata_path = { "id"              : "docid"
+                         , "title"           : ["en_title_s", "title_s"]
+                         , "abstract"        : ["en_abstract_s", "abstract_s"]
                         , "source"          : "journalTitle_s"
                         , "url"             : "uri_s"
                         , "authors"         : "authFullName_s"
@@ -43,26 +41,29 @@ class HalParser(Parser):

            for key, path in hyperdata_path.items():

-                    field = doc.get(path, "NOT FOUND")
+                # A path can be a field name or a sequence of field names
+                if isinstance(path, (list, tuple)):
+                    # Get first non-empty value of fields in path sequence, or None
+                    field = next((x for x in (doc.get(p) for p in path) if x), None)
+                else:
+                    # Get field value
+                    field = doc.get(path)
+
+                if field is None:
+                    field = "NOT FOUND"
+
                if isinstance(field, list):
-                        hyperdata[key] = ", ".join(map(lambda x: str(x), field))
+                    hyperdata[key] = ", ".join(map(str, field))
                else:
                    hyperdata[key] = str(field)

            if hyperdata["url"] in uris:
                print("Document already parsed")
+
            else:
                uris.add(hyperdata["url"])
-#            hyperdata["authors"] = ", ".join(
-#                                             [ p.get("person", {})
-#                                                .get("name"  , "")
-#                          
-#                                               for p in doc.get("hasauthor", [])
-#                                             ]
-#                                            )
-#            
-                maybeDate = doc.get("submittedDate_s", None)

+                maybeDate = doc.get("submittedDate_s", None)
                if maybeDate is not None:
                    date = datetime.strptime(maybeDate, "%Y-%m-%d %H:%M:%S")
                else:

--- a/install/notebook/gargantext_notebook.py
+++ b/install/notebook/gargantext_notebook.py
@@ -15,12 +15,16 @@ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
 django.setup()

 from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
-from gargantext.models import ProjectNode, DocumentNode, UserNode, User
+from gargantext.models import ProjectNode, DocumentNode
 from gargantext.util.db import session, get_engine
 from collections import Counter
 import importlib
 from django.http import Http404

+# Import these so that they are available to the notebook user
+from langdetect import detect as detect_lang
+from gargantext.models import UserNode, User
+

 class NotebookError(Exception):
    pass

--- a/templates/pages/projects/overview.html
+++ b/templates/pages/projects/overview.html
@@ -203,6 +203,7 @@
      // do something…
        resetStatusForm("#createForm");
      })
+      return false;

    })