Commit dae0243d authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge branch 'unstable-notebook' into unstable-merge

parents c59c147f de5325cb
......@@ -113,7 +113,7 @@ class HalCrawler(Crawler):
msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
, QUERY_SIZE_N_MAX
)
print("ERROR (scrap: Multivac d/l ): " , msg)
print("ERROR (scrap: HAL d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX
#for page in range(1, trunc(self.query_max / 100) + 2):
......
......@@ -11,17 +11,8 @@ from datetime import datetime
import json
class HalParser(Parser):
def parse(self, filebuf):
'''
parse :: FileBuff -> [Hyperdata]
'''
contents = filebuf.read().decode("UTF-8")
data = json.loads(contents)
def _parse(self, json_docs):
filebuf.close()
json_docs = data
hyperdata_list = []
hyperdata_path = { "id" : "isbn_s"
......@@ -73,3 +64,13 @@ class HalParser(Parser):
hyperdata_list.append(hyperdata)
return hyperdata_list
def parse(self, filebuf):
'''
parse :: FileBuff -> [Hyperdata]
'''
contents = filebuf.read().decode("UTF-8")
data = json.loads(contents)
return self._parse(data)
......@@ -16,7 +16,7 @@ sudo docker run \
--env POSTGRES_HOST=localhost \
-v /srv/gargantext:/srv/gargantext \
-it garg-notebook:latest \
/bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /srv/gargantext/ && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser'"
/bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /home/notebooks && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser'"
# #&& jupyter nbextension enable --py widgetsnbextension --sys-prefix
#/bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /srv/gargantext/ && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser --notebook-dir=/home/notebooks/'"
......
......@@ -78,32 +78,8 @@ RUN . /env_3-5/bin/activate && pip3 install -r requirements.txt
#RUN ./psql_configure.sh
#RUN ./django_configure.sh
RUN chown notebooks:notebooks -R /env_3-5
########################################################################
### Notebook IHaskell and IPYTHON ENVIRONNEMENT
########################################################################
#RUN apt-get update && apt-get install -y \
# libtinfo-dev \
# libzmq3-dev \
# libcairo2-dev \
# libpango1.0-dev \
# libmagic-dev \
# libblas-dev \
# liblapack-dev
#RUN curl -sSL https://get.haskellstack.org/ | sh
#RUN stack setup
#RUN git clone https://github.com/gibiansky/IHaskell
#RUN . /env_3-5/bin/activate \
# && cd IHaskell \
# && stack install gtk2hs-buildtools \
# && stack install --fast \
# && /root/.local/bin/ihaskell install --stack
#
#
########################################################################
### POSTGRESQL DATA (as ROOT)
########################################################################
......@@ -115,3 +91,32 @@ RUN chown notebooks:notebooks -R /env_3-5
EXPOSE 5432 8899
VOLUME ["/srv/","/home/notebooks/"]
########################################################################
### Notebook IHaskell and IPYTHON ENVIRONNEMENT
########################################################################
RUN apt-get update && apt-get install -y \
libtinfo-dev \
libzmq3-dev \
libcairo2-dev \
libpango1.0-dev \
libmagic-dev \
libblas-dev \
liblapack-dev
USER notebooks
RUN cd /home/notebooks \
&& curl -sSL https://get.haskellstack.org/ | sh \
&& stack setup \
&& git clone https://github.com/gibiansky/IHaskell \
&& . /env_3-5/bin/activate \
&& cd IHaskell \
&& stack install gtk2hs-buildtools \
&& stack install --fast \
&& /root/.local/bin/ihaskell install --stack
......@@ -22,7 +22,7 @@ application = get_wsgi_application()
from gargantext.util.toolchain.main import parse_extract_indexhyperdata
from gargantext.util.db import *
from gargantext.models import Node
from gargantext.util.toolchain.main import parse_extract_indexhyperdata
from nltk.tokenize import wordpunct_tokenize
from gargantext.models import *
......@@ -56,9 +56,7 @@ def chart(docs, field):
frame1 = pd.DataFrame(year_publis, columns=['Date', 'DateValue'], index=frame0.Date)
return frame1
from gargantext.util.crawlers.HAL import HalCrawler
def scan_hal(request):
hal = HalCrawler()
return hal.scan_results(request)
......@@ -73,3 +71,53 @@ def scan_gargantext(corpus_id, lang, request):
return [i for i in connection.execute(query)][0][0]
connection.close()
def myProject_fromUrl(url):
"""
myProject :: String -> Project
"""
project_id = url.split("/")[4]
project = session.query(Node).filter(Node.id == project_id).first()
return project
def newCorpus(project, resourceName=11, name="Machine learning", query="LSTM"):
print("Corpus \"%s\" in project \"%s\" created" % (name, project.name))
corpus = project.add_child(name="Corpus name", typename='CORPUS')
corpus.hyperdata["resources"] = [{"extracted" : "true", "type" : 11}]
corpus.hyperdata["statuses"] = [{"action" : "notebook", "complete" : "true"}]
# [TODO] Add informations needed to get buttons on the Project view.
session.add(corpus)
session.commit()
hal = HalCrawler()
max_result = hal.scan_results(query)
paging = 100
for page in range(0, max_result, paging):
print("%s documents downloaded / %s." % (str( paging * (page +1)), str(max_result) ))
docs = (hal._get(query, fromPage=page, count=paging)
.get("response", {})
.get("docs", [])
)
from gargantext.util.parsers.HAL import HalParser
# [TODO] fix boilerplate for docs here
new_docs = HalParser(docs)._parse(docs)
for doc in new_docs:
new_doc = (corpus.add_child( name = doc["title"][:255]
, typename = 'DOCUMENT')
)
new_doc["hyperdata"] = doc
session.add(new_doc)
session.commit()
print("Extracting the ngrams")
parse_extract_indexhyperdata(corpus)
print("Corpus is ready to explore:")
print("http://imt.gargantext.org/projects/%s/corpora/%s/" % (project.id, corpus.id))
return corpus
......@@ -2,11 +2,38 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Advanced Gargantext Tutorial (Python)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "TypeError",
"evalue": "'list' object is not callable",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-3-a8e3501c9a54>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'/srv/gargantext'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m: 'list' object is not callable"
]
}
],
"source": [
"import sys\n",
"sys.pa"
]
},
{
"cell_type": "code",
"execution_count": 1,
......@@ -28,7 +55,9 @@
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment