Commit e7ac6426 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge remote-tracking branch 'gargantext.org/simon-unstable-notebook' into unstable-merge

parents dae0243d 30c1dbdc
from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index
from sqlalchemy.orm import relationship
from sqlalchemy.orm import relationship, validates
from sqlalchemy.types import TypeDecorator, \
Integer, Float, Boolean, DateTime, String, Text
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double
......@@ -7,6 +7,7 @@ from sqlalchemy.ext.mutable import MutableDict, MutableList
from sqlalchemy.ext.declarative import declarative_base
__all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship",
"validates", "ValidatorMixin",
"Integer", "Float", "Boolean", "DateTime", "String", "Text",
"TypeDecorator",
"JSONB", "Double",
......@@ -18,6 +19,25 @@ __all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship",
# all tables handled by Alembic migration scripts.
Base = declarative_base()
# To be used by tables already handled by Django ORM, such as User model. We
# separate them in order to keep those out of Alembic sight.
DjangoBase = declarative_base()
class ValidatorMixin(object):
    """Mixin offering reusable helpers for SQLAlchemy ``@validates`` hooks."""

    def enforce_length(self, key, value):
        """Truncate a string according to its column length.

        Usage example:

        .. code-block:: python

            @validates('some_column')
            def validate_some_column(self, key, value):
                return self.enforce_length(key, value)

        :param key: name of the mapped attribute being validated.
        :param value: candidate value for that attribute.
        :returns: ``value`` cut to the declared length of the first column
            mapped to ``key``; ``value`` unchanged when it is falsy, already
            short enough, or when the column type declares no length.
        """
        column_type = getattr(self.__class__, key).prop.columns[0].type
        # Unbounded types either lack ``length`` or carry ``length=None``
        # (e.g. an unsized String); comparing ``len(value)`` with None would
        # raise TypeError, so such values are passed through untouched.
        max_len = getattr(column_type, 'length', None)
        if max_len is not None and value and len(value) > max_len:
            return value[:max_len]
        return value
......@@ -9,7 +9,7 @@ from datetime import datetime
from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
Integer, Float, String, DateTime, JSONB, \
MutableList, MutableDict
MutableList, MutableDict, validates, ValidatorMixin
from .users import User
__all__ = ['Node', 'NodeNode', 'CorpusNode']
......@@ -26,7 +26,7 @@ class NodeType(TypeDecorator):
return NODETYPES[typeindex]
class Node(Base):
class Node(ValidatorMixin, Base):
"""This model can fit many purposes:
myFirstCorpus = session.query(CorpusNode).first()
......@@ -112,6 +112,10 @@ class Node(Base):
'user_id={0.user_id}, parent_id={0.parent_id}, ' \
'name={0.name!r}, date={0.date})>'.format(self)
@validates('name')
def validate_name(self, key, value):
return self.enforce_length(key, value)
@property
def ngrams(self):
"""Pseudo-attribute allowing to retrieve a node's ngrams.
......
......@@ -73,7 +73,8 @@ from rest_framework.views import APIView
from gargantext.util.json import json_encoder
def JsonHttpResponse(data, status=200):
    """Build an ``application/json`` HttpResponse from ``data``.

    :param data: a ``str`` is sent as is, UTF-8 encoded (presumably it is
        already-serialized JSON -- confirm with callers); any other value is
        serialized with ``json_encoder``.
    :param status: HTTP status code of the response (default 200).
    :returns: the HttpResponse.
    """
    # ``content`` must be passed exactly once: a repeated keyword argument is
    # a SyntaxError, so the payload is computed up front.
    if isinstance(data, str):
        content = data.encode('utf-8')
    else:
        content = json_encoder.encode(data)
    return HttpResponse(
        content=content,
        content_type='application/json; charset=utf-8',
        status=status,
    )
......
#!/usr/bin/env python
"""
Gargantext Software Copyright (c) 2016-2017 CNRS ISC-PIF -
http://iscpif.fr
......@@ -6,45 +7,29 @@ http://gitlab.iscpif.fr/humanities/gargantext/blob/stable/LICENSE )
- In France : a CECILL variant affero compliant
- GNU aGPLV3 for all other countries
"""
#!/usr/bin/env python
import sys
import os
import os
import django
# Django settings
dirname = os.path.dirname(os.path.realpath(__file__))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext.settings")
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
django.setup()
# initialize Django application
from django.core.wsgi import get_wsgi_application
application = get_wsgi_application()
from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
from gargantext.models import ProjectNode, DocumentNode, UserNode, User
from gargantext.util.db import session, get_engine
from collections import Counter
import importlib
from django.http import Http404
from gargantext.util.toolchain.main import parse_extract_indexhyperdata
from gargantext.util.db import *
from gargantext.models import Node
from gargantext.util.toolchain.main import parse_extract_indexhyperdata
from nltk.tokenize import wordpunct_tokenize
from gargantext.models import *
from nltk.tokenize import word_tokenize
import nltk as nltk
from statistics import mean
from math import log
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import datetime
class NotebookError(Exception):
    """Error raised by the notebook helpers, e.g. on invalid arguments."""
from collections import Counter
from langdetect import detect as detect_lang
def documents(corpus_id):
    """Return all DocumentNode rows whose ``parent_id`` is ``corpus_id``.

    :param corpus_id: id of the parent (corpus) node.
    :returns: list of DocumentNode instances, in no particular order.
    """
    # A single query and a single return: a second ``return`` after the
    # first one can never run.
    # Ordering by hyperdata['publication_date'] is intentionally left out.
    return (session.query(DocumentNode)
                   .filter_by(parent_id=corpus_id)
                   .all())
#import seaborn as sns
......@@ -56,18 +41,21 @@ def chart(docs, field):
frame1 = pd.DataFrame(year_publis, columns=['Date', 'DateValue'], index=frame0.Date)
return frame1
from gargantext.util.crawlers.HAL import HalCrawler
def scan_hal(request):
    """Return ``HalCrawler.scan_results`` for ``request`` on a fresh crawler."""
    return HalCrawler().scan_results(request)
def scan_gargantext(corpus_id, lang, request):
    """Count the nodes of a corpus matching a PostgreSQL full-text query.

    :param corpus_id: id of the parent node whose children are searched.
    :param lang: text search configuration name handed to ``to_tsvector``.
    :param request: query string, in ``to_tsquery`` syntax.
    :returns: the number of matching nodes.
    """
    from sqlalchemy import text

    # TODO add some sugar to the request (ideally request should be the same
    # for hal and garg)
    # NOTE(review): ``hyperdata ->> 'abstract' || 'title'`` appends the
    # literal string 'title' to the abstract; it presumably was meant to be
    # ``hyperdata ->> 'title'`` -- kept as is, confirm before changing.
    # Values are bound as parameters instead of being interpolated into the
    # SQL text, so ``lang`` and ``request`` cannot inject SQL.
    query = text("""select count(n.id) from nodes n
                    where to_tsvector(CAST(:lang AS regconfig),
                                      hyperdata ->> 'abstract' || 'title')
                          @@ to_tsquery(:request)
                      AND n.parent_id = :corpus_id;""")
    params = {'lang': lang, 'request': request, 'corpus_id': corpus_id}

    connection = get_engine().connect()
    try:
        return connection.execute(query, params).scalar()
    finally:
        # Release the connection on success and on error alike; a close()
        # placed after the return statement would never run.
        connection.close()
......@@ -77,47 +65,117 @@ def myProject_fromUrl(url):
myProject :: String -> Project
"""
project_id = url.split("/")[4]
project = session.query(Node).filter(Node.id == project_id).first()
project = session.query(ProjectNode).get(project_id)
return project
def newCorpus(project, resourceName=11, name="Machine learning", query="LSTM"):
print("Corpus \"%s\" in project \"%s\" created" % (name, project.name))
corpus = project.add_child(name="Corpus name", typename='CORPUS')
corpus.hyperdata["resources"] = [{"extracted" : "true", "type" : 11}]
corpus.hyperdata["statuses"] = [{"action" : "notebook", "complete" : "true"}]
# [TODO] Add informations needed to get buttons on the Project view.
session.add(corpus)
session.commit()
hal = HalCrawler()
max_result = hal.scan_results(query)
paging = 100
for page in range(0, max_result, paging):
print("%s documents downloaded / %s." % (str( paging * (page +1)), str(max_result) ))
docs = (hal._get(query, fromPage=page, count=paging)
.get("response", {})
.get("docs", [])
)
from gargantext.util.parsers.HAL import HalParser
# [TODO] fix boilerplate for docs here
new_docs = HalParser(docs)._parse(docs)
for doc in new_docs:
new_doc = (corpus.add_child( name = doc["title"][:255]
, typename = 'DOCUMENT')
)
new_doc["hyperdata"] = doc
session.add(new_doc)
session.commit()
print("Extracting the ngrams")
parse_extract_indexhyperdata(corpus)
print("Corpus is ready to explore:")
print("http://imt.gargantext.org/projects/%s/corpora/%s/" % (project.id, corpus.id))
return corpus
def newCorpus(project, source, name=None, query=None):
    """Harvest ``query`` from ``source`` into a new corpus of ``project``.

    :param project: the ProjectNode that will own the corpus.
    :param source: resource identifier, either an id (int) or a name (str).
    :param name: corpus name; defaults to ``query`` when omitted.
    :param query: query string sent to the source.
    :returns: the result of ``run_moissonneur`` (the corpus, or None when
        the query has no result).
    :raises NotebookError: when an argument is invalid or the source cannot
        be resolved to a moissonneur module.
    """
    if name is None:
        name = query

    # One chain, so the first invalid argument is the one reported and an
    # earlier message is never overwritten by a later check.
    if not isinstance(project, ProjectNode):
        error = "a valid project"
    elif not isinstance(source, (int, str)):
        error = "a valid source identifier: id or name"
    elif not isinstance(query, str):
        error = "a valid query"
    elif not isinstance(name, str):
        error = "a valid name"
    else:
        error = None

    if error:
        raise NotebookError("Please provide %s." % error)

    if isinstance(source, int):
        resource = get_resource(source)
    else:
        resource = get_resource_by_name(source)

    if resource:
        moissonneur_name = get_moissonneur_name(resource)
    elif isinstance(source, str):
        # Unknown resource name: try it directly as a moissonneur name.
        moissonneur_name = source.lower()
    else:
        # Unknown numeric id: there is no name to fall back on, and an int
        # has no .lower(), so fail with the documented error type.
        raise NotebookError("Invalid source identifier: %r" % source)

    try:
        moissonneur = get_moissonneur(moissonneur_name)
    except ImportError as exc:
        raise NotebookError("Invalid source identifier: %r" % source) from exc

    return run_moissonneur(moissonneur, project, name, query)
def get_moissonneur_name(ident):
    """Return moissonneur module name from RESOURCETYPE or crawler name.

    :param ident: either an object exposing ``.get`` (its ``'crawler'``
        entry is used), a crawler class name such as ``'HalCrawler'``, or a
        moissonneur name.
    :returns: the lower-cased crawler name without its ``Crawler`` suffix;
        otherwise ``ident`` itself (possibly None), taken to already be a
        moissonneur name.
    """
    # Does it quack like a RESOURCETYPE?
    if hasattr(ident, 'get'):
        ident = ident.get('crawler')

    # Extract name from crawler class name...
    if isinstance(ident, str) and ident.endswith('Crawler'):
        return ident[:-len('Crawler')].lower()

    # ...otherwise assume ident is already a moissonneur name and hand it
    # back, instead of silently falling through to None.
    return ident
def get_moissonneur(name):
    """Import the ``moissonneurs.<name>`` module and return it.

    The returned module gets a ``name`` attribute set to ``name``.

    :raises NotebookError: if ``name`` is not an all-lowercase string.
    :raises ImportError: if no such module can be imported.
    """
    is_valid = isinstance(name, str) and name.islower()
    if not is_valid:
        raise NotebookError("Invalid moissonneur name: %r" % name)

    moissonneur = importlib.import_module('moissonneurs.%s' % name)
    # Tag the module with its short name; run_moissonneur branches on it.
    moissonneur.name = name
    return moissonneur
def run_moissonneur(moissonneur, project, name, query):
    """Run a moissonneur by hand and return the resulting corpus.

    Calls ``moissonneur.query`` then ``moissonneur.save`` with a stand-in
    request object built from the arguments.

    :param moissonneur: module as returned by ``get_moissonneur`` (must
        expose ``name``, ``query`` and ``save``).
    :param project: project providing ``id`` and ``user_id``.
    :param name: name given to the corpus.
    :param query: query string.
    :returns: the saved corpus, or None when the query reports no result.
    :raises ValueError, Http404: propagated unchanged from the moissonneur
        calls or from JSON decoding.
    """
    import json

    # XXX Kludge: a bare object carrying only the attributes set below is
    # passed where the moissonneur functions take a request.
    class Dummy(object):
        pass

    # XXX 'string' only has an effect on moissonneurs.pubmed; its value is
    #     added when processing the request client-side, see
    #     templates/projects/project.html for more details.
    post = {'string': name,
            'query': query,
            'N': QUERY_SIZE_N_MAX}

    user = Dummy()
    user.id = project.user_id
    user.is_authenticated = lambda: True

    request = Dummy()
    request.method = 'POST'
    request.path = 'nowhere'
    request.META = {}
    request.POST = post
    request.user = user

    source_name = moissonneur.name
    if source_name == 'istex':
        # Replace ALL spaces by plus signs
        request.POST['query'] = '+'.join(filter(None, query.split(' ')))

    # Errors from here on simply propagate to the caller.
    response = moissonneur.query(request)
    raw_json = response.content.decode('utf-8')
    data = json.loads(raw_json)

    if moissonneur.name == 'pubmed':
        count = sum(x['count'] for x in data)
        # The raw JSON answer of query() becomes the query handed to save().
        request.POST['query'] = raw_json
    elif moissonneur.name == 'istex':
        count = data.get('total', 0)
    else:
        count = data.get('results_nb', 0)

    if not (count > 0):
        return None

    corpus = moissonneur.save(request, project.id, return_corpus=True)

    # Sometimes strange things happen: make sure the corpus carries the
    # requested name.
    if corpus.name != name:
        corpus.name = name
        session.commit()

    return corpus
......@@ -30,7 +30,7 @@ def query( request):
#ids = crawlerbot.get_ids(query)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
def save(request, project_id, return_corpus=False):
'''save'''
if request.method == "POST":
......@@ -101,6 +101,9 @@ def save(request, project_id):
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
......
......@@ -33,7 +33,7 @@ def query( request):
print(results)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
def save(request, project_id, return_corpus=False):
'''save'''
if request.method == "POST":
......@@ -103,6 +103,9 @@ def save(request, project_id):
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
......
......@@ -29,7 +29,7 @@ def query( request):
#ids = crawlerbot.get_ids(query)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
def save(request, project_id, return_corpus=False):
'''save'''
if request.method == "POST":
......@@ -100,6 +100,9 @@ def save(request, project_id):
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
......
......@@ -52,7 +52,7 @@ def query( request ):
def save(request , project_id):
def save(request , project_id, return_corpus=False):
print("testISTEX:")
print(request.method)
alist = ["bar","foo"]
......@@ -171,6 +171,9 @@ def save(request , project_id):
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
......
......@@ -33,7 +33,7 @@ def query( request):
print(results)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
def save(request, project_id, return_corpus=False):
'''save'''
if request.method == "POST":
......@@ -104,6 +104,9 @@ def save(request, project_id):
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
......
......@@ -69,7 +69,7 @@ def query( request ):
return JsonHttpResponse(data)
def save( request , project_id ) :
def save( request , project_id, return_corpus=False ) :
# implicit global session
# do we have a valid project id?
try:
......@@ -164,6 +164,10 @@ def save( request , project_id ) :
session.rollback()
# --------------------------------------------
sleep(1)
if return_corpus:
return corpus
return HttpResponseRedirect('/projects/' + str(project_id))
data = alist
......
......@@ -2,10 +2,7 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"# Advanced Gargantext Tutorial (Python)"
]
......@@ -13,9 +10,7 @@
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
......@@ -25,11 +20,7 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"# First import the library Gargantext Notebook\n",
......@@ -41,17 +32,19 @@
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus \"Machine learning\" in project \"Tests\" created\n",
"100 documents downloaded / 105.\n",
"105\n",
"LSTM 1000\n",
"Downloading page 0 to 100 results\n",
"Downloading page 100 to 100 results\n",
"CORPUS #17058\n",
"PARSING\n",
"Loading available PARSERS:\n",
"\t- EuropresseParser\n",
"\t- RISParser\n",
......@@ -64,85 +57,64 @@
"\t- CernParser\n",
"\t- MultivacParser\n",
"\t- HalParser\n",
"\t- IsidoreParser\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Exception ignored in: <bound method Parser.__del__ of <gargantext.util.parsers.HAL.HalParser object at 0x7f8d1d70ad30>>\n",
"Traceback (most recent call last):\n",
" File \"/srv/gargantext/gargantext/util/parsers/_Parser.py\", line 24, in __del__\n",
" self._file.close()\n",
"AttributeError: 'list' object has no attribute 'close'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"10100 documents downloaded / 105.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Exception ignored in: <bound method Parser.__del__ of <gargantext.util.parsers.HAL.HalParser object at 0x7f8d1d6f1e10>>\n",
"Traceback (most recent call last):\n",
" File \"/srv/gargantext/gargantext/util/parsers/_Parser.py\", line 24, in __del__\n",
" self._file.close()\n",
"AttributeError: 'list' object has no attribute 'close'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting the ngrams\n",
"CORPUS #300990\n",
"CORPUS #300990: parsed 105\n"
]
},
{
"ename": "KeyError",
"evalue": "'languages'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-3-c08deaa02bf2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mproject\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmyProject_fromUrl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"http://imt.gargantext.org/projects/300535\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mcorpus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnewCorpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mproject\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Machine learning\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"LSTM\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/srv/gargantext/gargantext_notebook.py\u001b[0m in \u001b[0;36mnewCorpus\u001b[0;34m(project, resourceName, name, query)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Extracting the ngrams\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 117\u001b[0;31m \u001b[0mparse_extract_indexhyperdata\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 118\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Corpus is ready to explore:\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/celery/local.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *a, **kw)\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 187\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 188\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_current_object\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 189\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 190\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/alexandre/local/logiciels/python/env/3.5_20170123/lib/python3.5/site-packages/celery/app/task.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 426\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__self__\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 427\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__self__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 428\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 429\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 430\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/srv/gargantext/gargantext/util/toolchain/main.py\u001b[0m in \u001b[0;36mparse_extract_indexhyperdata\u001b[0;34m(corpus)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mdocs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcorpus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"DOCUMENT\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'CORPUS #%d: parsed %d'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdocs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0mextract_ngrams\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;31m# Preparing Databse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/srv/gargantext/gargantext/util/toolchain/ngrams_extraction.py\u001b[0m in \u001b[0;36mextract_ngrams\u001b[0;34m(corpus, keys, do_subngrams)\u001b[0m\n\u001b[1;32m 151\u001b[0m \u001b[0mcorpus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Ngrams'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merror\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[0mcorpus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_hyperdata\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 153\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 154\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 155\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/srv/gargantext/gargantext/util/toolchain/ngrams_extraction.py\u001b[0m in \u001b[0;36mextract_ngrams\u001b[0;34m(corpus, keys, do_subngrams)\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;31m#load available taggers for default langage of plateform\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;31m#print(LANGUAGES.keys())\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m tagger_bots = {lang: load_tagger(lang) for lang in corpus.hyperdata[\"languages\"] \\\n\u001b[0m\u001b[1;32m 57\u001b[0m if lang != \"__unknown__\"}\n\u001b[1;32m 58\u001b[0m \u001b[0mtagger_bots\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"__unknown__\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_tagger\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"en\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 'languages'"
"\t- IsidoreParser\n",
"0 docs skipped\n",
"105 parsed\n",
"#MAIN language of the CORPUS __unknown__\n",
"CORPUS #17058: parsed 105\n",
"INTEGRATE\n",
"INTEGRATE\n",
"CORPUS #17058: extracted ngrams\n",
"CORPUS #17058: indexed hyperdata\n",
"CORPUS #17058: [2017-08-11_11:21:18] new favorites node #17164\n",
"CORPUS #17058: [2017-08-11_11:21:18] starting ngram lists computation\n",
"CORPUS #17058: [2017-08-11_11:21:18] new stoplist node #17165\n",
"# STEMMERS LOADED {'__unknown__': <nltk.stem.snowball.SnowballStemmer object at 0x7fb811588e48>}\n",
"#SUPPORTED STEMMERS LANGS []\n",
"CORPUS #17058: [2017-08-11_11:21:20] new grouplist node #17166\n",
"CORPUS #17058: [2017-08-11_11:21:20] new occs node #17167\n",
"compute_ti_ranking\n",
"2017-08-11_11:21:20 : Starting Query tf_nd_query\n",
"2017-08-11_11:21:21 : End Query tf_nd_quer\n",
"2017-08-11_11:21:21 : tfidfsum\n",
"CORPUS #17058: [2017-08-11_11:21:21] new ti ranking node #17168\n",
"MAINLIST: keeping 2908 ngrams out of 3878\n",
"CORPUS #17058: [2017-08-11_11:21:21] new mainlist node #17169\n",
"Compute TFIDF local\n",
"CORPUS #17058: [2017-08-11_11:21:22] new localtfidf node #17170\n",
"COOCS: NEW matrix shape [220x807]\n",
"CORPUS #17058: [2017-08-11_11:21:23] computed mainlist coocs for specif rank\n",
"SPECIFICITY: computing on 154 ngrams\n",
"CORPUS #17058: [2017-08-11_11:21:23] new spec-clusion node #17171\n",
"CORPUS #17058: [2017-08-11_11:21:23] new gen-clusion node #17172\n",
"MAPLIST quotas: {'topgen': {'multigrams': 168, 'monograms': 42}, 'topspec': {'multigrams': 112, 'monograms': 28}}\n",
"MAPLIST: top_spec_monograms = 28\n",
"MAPLIST: top_spec_multigrams = 41\n",
"MAPLIST: top_gen_monograms = 42\n",
"MAPLIST: top_gen_multigrams = 0\n",
"MAPLIST: kept 111 ngrams in total \n",
"CORPUS #17058: [2017-08-11_11:21:23] new maplist node #17173\n",
"CORPUS #17058: [2017-08-11_11:21:23] FINISHED ngram lists computation\n"
]
}
],
"source": [
"project = myProject_fromUrl(\"http://imt.gargantext.org/projects/300535\")\n",
"corpus = newCorpus(project, name=\"Machine learning\", query=\"LSTM\")"
"#project = myProject_fromUrl(\"http://imt.gargantext.org/projects/300535\")\n",
"project = myProject_fromUrl(\"http://localhost:8000/projects/2\")\n",
"corpus = newCorpus(project, source=\"hal\", name=\"Machine learning\", query=\"LSTM\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"session.query(Node.hyperdata[\"\"])"
......@@ -151,27 +123,21 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"\n",
......@@ -181,9 +147,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"metadata": {},
"outputs": [],
"source": [
"for doc in new_docs:\n",
......@@ -218,11 +182,7 @@
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"name": "stdout",
......@@ -244,11 +204,7 @@
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"# To get all the documents:\n",
......@@ -258,11 +214,7 @@
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"data": {
......@@ -284,11 +236,7 @@
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"data": {
......@@ -309,11 +257,7 @@
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"data": {
......@@ -334,11 +278,7 @@
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"data": {
......@@ -360,9 +300,7 @@
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": [
......@@ -373,11 +311,7 @@
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"data": {
......@@ -406,10 +340,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"## Title\n",
"\n",
......@@ -422,19 +353,14 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"# Lang Cleaning tools"
]
......@@ -442,11 +368,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"detect_lang(\"Ceci est une phrase en français.\")"
......@@ -455,11 +377,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"detect_lang(\"This is an english sentence.\")"
......@@ -468,11 +386,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"chart(docs, \"language_iso2\").plot.bar()"
......@@ -481,11 +395,7 @@
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"data": {
......@@ -516,9 +426,7 @@
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": [
......@@ -534,11 +442,7 @@
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"data": {
......@@ -558,11 +462,7 @@
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"data": {
......@@ -606,9 +506,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": [
......@@ -617,10 +515,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"# Measures IMT Tools"
]
......@@ -628,11 +523,7 @@
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"ename": "ConnectionError",
......@@ -680,11 +571,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"# Request syntax\n",
......@@ -701,11 +588,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"# Forces / Faiblesses de l'IMT\n",
......@@ -720,9 +603,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": [
......@@ -734,11 +615,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"# Then chart it to see your strenght and weakness!\n",
......@@ -749,9 +626,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": []
......@@ -760,9 +635,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": []
......@@ -771,9 +644,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": []
......@@ -782,9 +653,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": []
......@@ -793,9 +662,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": []
......@@ -804,19 +671,14 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"# Graph generation"
]
......@@ -825,9 +687,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": [
......@@ -838,9 +698,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": [
......@@ -849,10 +707,7 @@
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"# List Management"
]
......@@ -861,9 +716,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": [
......@@ -874,9 +727,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": [
......@@ -900,7 +751,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3rc1"
"version": "3.5.3"
}
},
"nbformat": 4,
......
......@@ -57,7 +57,7 @@
<center id="corpus" class="help">
<a data-toggle="modal" href="#addcorpus" >
<button
type="button"
......@@ -532,7 +532,7 @@
$("#submit_thing").html("Process a {{ query_size }} sample!")
thequeries = data
var N=0,k=0;
var N=0;
for(var i in thequeries) N += thequeries[i].count
if( N>0) {
......@@ -571,12 +571,11 @@
$("#submit_thing").html("Process a {{ query_size }} sample!")
thequeries = data
var N=data.length,k=0;
// for(var i in thequeries) N += thequeries[i].count
if( N>1) {
var total = JSON.parse(data).total
console.log("N: "+total)
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+total+" publications.</i><br>")
var N = data.total;
if (N > 0) {
console.log("N: "+N)
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications.</i><br>")
$('#submit_thing').prop('disabled', false);
} else {
$("#theresults").html("<i> <b>"+data[0]+"</b></i><br>")
......@@ -661,7 +660,7 @@
console.log(data)
console.log("SUCCESS")
console.log("enabling "+"#"+value.id)
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
......@@ -721,7 +720,7 @@
console.log(data)
console.log("SUCCESS")
console.log("enabling "+"#"+value.id)
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
......@@ -781,7 +780,7 @@
console.log(data)
console.log("SUCCESS")
console.log("enabling "+"#"+value.id)
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
......@@ -876,12 +875,12 @@
console.log("selected:", selectedId);
// by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN
if ( selectedId == "3"
|| selectedId == "8"
|| selectedId == "9"
|| selectedId == "10"
|| selectedId == "11"
|| selectedId == "12"
if ( selectedId == "3"
|| selectedId == "8"
|| selectedId == "9"
|| selectedId == "10"
|| selectedId == "11"
|| selectedId == "12"
) {
console.log("show the button for: " + selectedId)
$("#div-fileornot").css("visibility", "visible");
......@@ -1019,16 +1018,16 @@
function saveMultivac(query, N){
console.log("In Multivac")
if(!query || query=="") return;
console.log(query)
//var origQuery = query
var data = { "query" : query , "N": N };
// Replace all the slashes
var projectid = window.location.href.split("projects")[1].replace(/\//g, '')
console.log(data)
$.ajax({
dataType: 'json',
......@@ -1066,16 +1065,16 @@
function save(query, N, urlGarg){
console.log("In Gargantext")
if(!query || query=="") return;
console.log(query)
//var origQuery = query
var data = { "query" : query , "N": N };
// Replace all the slashes
var projectid = window.location.href.split("projects")[1].replace(/\//g, '')
console.log(data)
$.ajax({
dataType: 'json',
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment