Commit 72ae16a8 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[MERGE]

parents 4dbe577c fe23f25f
......@@ -38,7 +38,9 @@ class HalCrawler(Crawler):
def _get(self, query, fromPage=1, count=10, lang=None):
# Parameters
fl = """ en_title_s
fl = """ docid
, title_s
, abstract_s
, en_title_s
, en_abstract_s
, submittedDate_s
......
......@@ -94,19 +94,30 @@ def query_list(list_id,
else:
# NB: score can be undefined (eg ex-subform that now became free)
# ==> we need outerjoin
# and the filter needs to have scoring_metric_id so we do it before
NNN = NodeNodeNgram
ScoresTable = (session
.query(NodeNodeNgram.score, NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == scoring_metric_id)
.subquery()
)
query = (session
.query(Ngram.id, Ngram.terms, NNN.score)
# Ngrams must be related to our list <Node(id=list_id)>
.join(NodeNgram, (NodeNgram.ngram_id == Ngram.id) &
(NodeNgram.node_id == list_id))
# Select by metric <Node(id=scoring_metric_id)>
.outerjoin(NNN, (NNN.ngram_id == Ngram.id) &
(NNN.node1_id == scoring_metric_id))
# Sort by descending score
.order_by(NNN.score.desc())
.query(
NodeNgram.ngram_id,
Ngram.terms,
ScoresTable.c.score
)
.join(Ngram, NodeNgram.ngram_id == Ngram.id)
# main filter ----------------------
.filter(NodeNgram.node_id == list_id)
# scores if possible
.outerjoin(ScoresTable,
ScoresTable.c.ngram_id == NodeNgram.ngram_id)
.order_by(desc(ScoresTable.c.score))
)
if pagination_limit:
......
......@@ -15,11 +15,9 @@ class HalParser(Parser):
hyperdata_list = []
hyperdata_path = { "id" : "isbn_s"
, "title" : "title_s"
, "abstract" : "abstract_s"
, "title" : "en_title_s"
, "abstract" : "en_abstract_s"
hyperdata_path = { "id" : "docid"
, "title" : ["en_title_s", "title_s"]
, "abstract" : ["en_abstract_s", "abstract_s"]
, "source" : "journalTitle_s"
, "url" : "uri_s"
, "authors" : "authFullName_s"
......@@ -43,26 +41,29 @@ class HalParser(Parser):
for key, path in hyperdata_path.items():
field = doc.get(path, "NOT FOUND")
# A path can be a field name or a sequence of field names
if isinstance(path, (list, tuple)):
# Get first non-empty value of fields in path sequence, or None
field = next((x for x in (doc.get(p) for p in path) if x), None)
else:
# Get field value
field = doc.get(path)
if field is None:
field = "NOT FOUND"
if isinstance(field, list):
hyperdata[key] = ", ".join(map(lambda x: str(x), field))
hyperdata[key] = ", ".join(map(str, field))
else:
hyperdata[key] = str(field)
if hyperdata["url"] in uris:
print("Document already parsed")
else:
uris.add(hyperdata["url"])
# hyperdata["authors"] = ", ".join(
# [ p.get("person", {})
# .get("name" , "")
#
# for p in doc.get("hasauthor", [])
# ]
# )
#
maybeDate = doc.get("submittedDate_s", None)
maybeDate = doc.get("submittedDate_s", None)
if maybeDate is not None:
date = datetime.strptime(maybeDate, "%Y-%m-%d %H:%M:%S")
else:
......
......@@ -15,12 +15,16 @@ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
django.setup()
from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
from gargantext.models import ProjectNode, DocumentNode, UserNode, User
from gargantext.models import ProjectNode, DocumentNode
from gargantext.util.db import session, get_engine
from collections import Counter
import importlib
from django.http import Http404
# Import those to be available by notebook user
from langdetect import detect as detect_lang
from gargantext.models import UserNode, User
class NotebookError(Exception):
pass
......
......@@ -203,6 +203,7 @@
// do something…
resetStatusForm("#createForm");
})
return false;
})
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment