Commit 0ab4f20a authored by delanoe

[MERGE] Testing 2 stable.

parents 48c7e541 499a52e7
......@@ -2,6 +2,12 @@
* Guided Tour
* Sources form: highlighting of crawler-enabled sources
## Version 3.0.6.8
* REPEC Crawler (connection with https://multivac.iscpif.fr)
* HAL Crawler (connection to https://hal.archives-ouvertes.fr/)
* New Graph Feature: color nodes by growth
## Version 3.0.6.4
* COOC SQL improved
......
......@@ -181,8 +181,6 @@ def get_tagger(lang):
return tagger()
RESOURCETYPES = [
{ "type": 1,
'name': 'Europresse',
......@@ -199,7 +197,7 @@ RESOURCETYPES = [
'crawler': None,
},
{ 'type': 3,
'name': 'Pubmed [XML]',
'name': 'Pubmed [CRAWLER/XML]',
'format': 'Pubmed',
'parser': "PubmedParser",
'file_formats':["zip", "xml"],
......@@ -235,26 +233,43 @@ RESOURCETYPES = [
'crawler': None,
},
{ 'type': 8,
'name': 'ISTex',
'name': 'ISTex [CRAWLER]',
'format': 'json',
'parser': "ISTexParser",
'file_formats':["zip", "txt"],
'crawler': None,
},
{ "type": 9,
"name": 'SCOAP [XML]',
"name": 'SCOAP [CRAWLER/XML]',
"parser": "CernParser",
"format": 'MARC21',
'file_formats':["zip","xml"],
"crawler": "CernCrawler",
},
# { "type": 10,
# "name": 'REPEC [RIS]',
# "parser": "RISParser",
# "format": 'RIS',
# 'file_formats':["zip","ris", "txt"],
# "crawler": None,
# },
#
{ "type": 10,
"name": 'REPEC [RIS]',
"parser": "RISParser",
"format": 'RIS',
'file_formats':["zip","ris", "txt"],
"crawler": None,
"name": 'REPEC [CRAWLER]',
"parser": "MultivacParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "MultivacCrawler",
},
{ "type": 11,
"name": 'HAL [CRAWLER]',
"parser": "HalParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "HalCrawler",
},
]
#shortcut for resources declaration in template
PARSERS = [(n["type"],n["name"]) for n in RESOURCETYPES if n["parser"] is not None]
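# e.g., given the declarations above, PARSERS contains pairs such as
# (3, 'Pubmed [CRAWLER/XML]'), (10, 'REPEC [CRAWLER]') and (11, 'HAL [CRAWLER]')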
......
......@@ -28,19 +28,20 @@ import graph.urls
import moissonneurs.urls
urlpatterns = [ url(r'^admin/' , admin.site.urls )
, url(r'^api/' , include( gargantext.views.api.urls ) )
, url(r'^' , include( gargantext.views.pages.urls ) )
urlpatterns = [ url(r'^admin/' , admin.site.urls )
, url(r'^api/' , include( gargantext.views.api.urls ) )
, url(r'^' , include( gargantext.views.pages.urls ) )
, url(r'^favicon.ico$', Redirect.as_view( url=static.url('favicon.ico')
, permanent=False), name="favicon")
, permanent=False), name="favicon" )
# Module Graph
, url(r'^' , include( graph.urls ) )
, url(r'^' , include( graph.urls ) )
# Module Annotation
# tempo: unchanged doc-annotations routes --
, url(r'^annotations/', include( annotations_urls ) )
, url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/(focus=[0-9,]+)?$', annotations_main_view)
, url(r'^annotations/', include( annotations_urls ) )
, url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/(focus=[0-9,]+)?$'
, annotations_main_view)
# Module Scrapers (Moissonneurs in French)
, url(r'^moissonneurs/' , include( moissonneurs.urls ) )
......
......@@ -4,7 +4,7 @@
# ***** CERN Scraper *****
# ****************************
# Author:c24b
# Date: 27/05/2015
# Date: 27/05/2016
import hmac, hashlib
import requests
import os
......@@ -96,10 +96,12 @@ class CernCrawler(Crawler):
print(self.results_nb, "res")
#self.generate_urls()
return(self.ids)
def generate_urls(self):
'''generate one raw export URL per record id'''
self.urls = ["http://repo.scoap3.org/record/%i/export/xm?ln=en" %rid for rid in self.ids]
return self.urls
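# e.g. self.ids == [123] gives ["http://repo.scoap3.org/record/123/export/xm?ln=en"]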
def fetch_records(self, ids):
''' for NEXT time'''
raise NotImplementedError
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** HAL Scraper ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Crawler import *
import json
from gargantext.constants import UPLOAD_DIRECTORY
from math import trunc
from gargantext.util.files import save
class HalCrawler(Crawler):
''' HAL API CLIENT'''
def __init__(self):
# Main EndPoints
self.BASE_URL = "https://api.archives-ouvertes.fr"
self.API_URL = "search"
# Final EndPoints
# TODO : Change endpoint according to the type of database
self.URL = self.BASE_URL + "/" + self.API_URL
self.status = []
def __format_query__(self, query=None):
'''format the query'''
#search_field="title_t"
search_field="abstract_t"
return (search_field + ":" + "(" + query + ")")
def _get(self, query, fromPage=1, count=10, lang=None):
# Parameters
fl = """ title_s
, abstract_s
, submittedDate_s
, journalDate_s
, authFullName_s
, uri_s
, isbn_s
, issue_s
, journalPublisher_s
"""
#, authUrl_s
#, type_s
wt = "json"
querystring = { "q" : query
, "rows" : count
, "start" : fromPage
, "fl" : fl
, "wt" : wt
}
# Specify Headers
headers = { "cache-control" : "no-cache" }
# Do Request and get response
response = requests.request( "GET"
, self.URL
, headers = headers
, params = querystring
)
#print(querystring)
# Validation : 200 if ok else raise Value
if response.status_code == 200:
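# extract the charset from the Content-Type header,
# e.g. "application/json; charset=UTF-8" -> "UTF-8"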
charset = ( response.headers["Content-Type"]
.split("; ")[1]
.split("=" )[1]
)
return (json.loads(response.content.decode(charset)))
else:
raise ValueError(response.status_code, response.reason)
def scan_results(self, query):
'''
scan_results : Returns the number of results
Query String -> Int
'''
self.results_nb = 0
total = ( self._get(query)
.get("response", {})
.get("numFound" , 0)
)
self.results_nb = total
return self.results_nb
def download(self, query):
downloaded = False
self.status.append("fetching results")
corpus = []
paging = 100
self.query_max = self.scan_results(query)
#print("self.query_max : %s" % self.query_max)
if self.query_max > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
, QUERY_SIZE_N_MAX
)
print("ERROR (scrap: Multivac d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX
#for page in range(1, trunc(self.query_max / 100) + 2):
for page in range(0, self.query_max, paging):
print("Downloading page %s to %s results" % (page, paging))
docs = (self._get(query, fromPage=page, count=paging)
.get("response", {})
.get("docs" , [])
)
for doc in docs:
corpus.append(doc)
self.path = save( json.dumps(corpus).encode("utf-8")
, name='HAL.json'
, basedir=UPLOAD_DIRECTORY
)
downloaded = True
return downloaded
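# A minimal usage sketch of the client above (network access assumed; requests
# and QUERY_SIZE_N_MAX come in via `from ._Crawler import *`):
#
#   crawler = HalCrawler()
#   query   = crawler.__format_query__("gargantua")  # -> 'abstract_t:(gargantua)'
#   total   = crawler.scan_results(query)            # number of matching documents
#   if crawler.download(query):
#       print(crawler.path)                          # HAL.json under UPLOAD_DIRECTORY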
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** MULTIVAC Scraper ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Crawler import *
import json
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** MULTIVAC Scraper ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Crawler import *
import json
from gargantext.settings import API_TOKENS
from gargantext.constants import UPLOAD_DIRECTORY
from math import trunc
from gargantext.util.files import save
class MultivacCrawler(Crawler):
''' Multivac API CLIENT'''
def __init__(self):
self.apikey = API_TOKENS["MULTIVAC"]
# Main EndPoints
self.BASE_URL = "https://api.iscpif.fr/v2"
self.API_URL = "pvt/economy/repec/search"
# Final EndPoints
# TODO : Change endpoint according to the type of database
self.URL = self.BASE_URL + "/" + self.API_URL
self.status = []
def __format_query__(self, query=None):
'''format the query (not implemented for the Multivac endpoint)'''
return None
def _get(self, query, fromPage=1, count=10, lang=None):
# Parameters
querystring = { "q" : query
, "count" : count
, "from" : fromPage
, "api_key" : API_TOKENS["MULTIVAC"]["APIKEY"]
}
if lang is not None:
querystring["lang"] = lang
# Specify Headers
headers = { "cache-control" : "no-cache" }
# Do Request and get response
response = requests.request( "GET"
, self.URL
, headers = headers
, params = querystring
)
#print(querystring)
# Validation : 200 if ok else raise Value
if response.status_code == 200:
charset = ( response.headers["Content-Type"]
.split("; ")[1]
.split("=" )[1]
)
return (json.loads(response.content.decode(charset)))
else:
raise ValueError(response.status_code, response.reason)
def scan_results(self, query):
'''
scan_results : Returns the number of results
Query String -> Int
'''
self.results_nb = 0
total = ( self._get(query)
.get("results", {})
.get("total" , 0)
)
self.results_nb = total
return self.results_nb
def download(self, query):
downloaded = False
self.status.append("fetching results")
corpus = []
paging = 100
self.query_max = self.scan_results(query)
#print("self.query_max : %s" % self.query_max)
if self.query_max > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
, QUERY_SIZE_N_MAX
)
print("ERROR (scrap: Multivac d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX
for page in range(1, trunc(self.query_max / 100) + 2):
print("Downloading page %s to %s results" % (page, paging))
docs = (self._get(query, fromPage=page, count=paging)
.get("results", {})
.get("hits" , [])
)
for doc in docs:
corpus.append(doc)
self.path = save( json.dumps(corpus).encode("utf-8")
, name='Multivac.json'
, basedir=UPLOAD_DIRECTORY
)
downloaded = True
return downloaded
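# NB: pagination differs between the two crawlers: HalCrawler walks 0-based
# result offsets (range(0, query_max, paging)) while MultivacCrawler walks
# 1-based page numbers (range(1, trunc(query_max / 100) + 2)).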
# Scrapers config
QUERY_SIZE_N_MAX = 1000
from gargantext.constants import get_resource
from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.util.scheduling import scheduled
from gargantext.util.db import session
from requests_futures.sessions import FuturesSession
......@@ -18,31 +18,34 @@ class Crawler:
#the name of corpus
#that will be built in case of internal fileparsing
self.record = record
self.name = record["corpus_name"]
self.project_id = record["project_id"]
self.user_id = record["user_id"]
self.resource = record["source"]
self.type = get_resource(self.resource)
self.query = record["query"]
self.record = record
self.name = record["corpus_name"]
self.project_id = record["project_id"]
self.user_id = record["user_id"]
self.resource = record["source"]
self.type = get_resource(self.resource)
self.query = record["query"]
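# `record` is the form payload coming from the crawler views; judging from
# the keys read above it holds at least corpus_name, project_id, user_id,
# source and query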
#format the sampling
self.n_last_years = 5
self.YEAR = date.today().year
self.YEAR = date.today().year
# not pretty
# but the easy version
self.MONTH = str(date.today().month)
self.MONTH = str(date.today().month)
if len(self.MONTH) == 1:
self.MONTH = "0"+self.MONTH
self.MAX_RESULTS = 1000
self.MAX_RESULTS = QUERY_SIZE_N_MAX
try:
self.results_nb = int(record["count"])
except KeyError:
# does not exist yet
self.results_nb = 0
try:
self.webEnv = record["webEnv"]
self.webEnv = record["webEnv"]
self.queryKey = record["queryKey"]
self.retMax = record["retMax"]
self.retMax = record["retMax"]
except KeyError:
# does not exist yet
self.queryKey = None
......@@ -67,6 +70,7 @@ class Crawler:
if self.download():
self.create_corpus()
return self.corpus_id
def get_sampling_dates():
'''Create a sample list of min and max dates based on Y and M
for N_LAST_YEARS results'''
......
......@@ -171,3 +171,6 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None, do_stat
cursor.execute('COMMIT WORK;')
cursor.close()
......@@ -8,29 +8,12 @@ import random
_members = [
{ 'first_name' : 'Constance', 'last_name' : 'de Quatrebarbes',
'mail' : '4barbesATgmail.com',
'website' : 'http://c24b.github.io/',
'picture' : 'constance.jpg',
'role' : 'developer'},
{ 'first_name' : 'David', 'last_name' : 'Chavalarias',
'mail' : 'david.chavalariasATiscpif.fr',
'website' : 'http://chavalarias.com',
'picture' : 'david.jpg',
'role':'principal investigator'},
# { 'first_name' : 'Elias', 'last_name' : 'Showk',
# 'mail' : '',
# 'website' : 'https://github.com/elishowk',
# 'picture' : '', 'role' : 'developer'},
{ 'first_name' : 'Mathieu', 'last_name' : 'Rodic',
'mail' : '',
'website' : 'http://rodic.fr',
'picture' : 'mathieu.jpg',
'role' : 'developer'},
{ 'first_name' : 'Samuel', 'last_name' : 'Castillo J.',
'mail' : 'kaisleanATgmail.com',
'website' : 'http://www.pksm3.droppages.com',
......@@ -43,12 +26,6 @@ _members = [
'picture' : 'maziyar.jpg',
'role' : 'developer'},
{ 'first_name' : 'Romain', 'last_name' : 'Loth',
'mail' : '',
'website' : 'http://iscpif.fr',
'picture' : 'romain.jpg',
'role' : 'developer'},
{ 'first_name' : 'Alexandre', 'last_name' : 'Delanoë',
'mail' : 'alexandre+gargantextATdelanoe.org',
'website' : 'http://alexandre.delanoe.org',
......@@ -59,8 +36,33 @@ _members = [
# copy-paste the line above and write your informations please
]
_membersPast = [
{ 'first_name' : 'Constance', 'last_name' : 'de Quatrebarbes',
'mail' : '4barbesATgmail.com',
'website' : 'http://c24b.github.io/',
'picture' : 'constance.jpg',
'role' : 'developer'},
{ 'first_name' : 'Mathieu', 'last_name' : 'Rodic',
'mail' : '',
'website' : 'http://rodic.fr',
'picture' : 'mathieu.jpg',
'role' : 'developer'},
{ 'first_name' : 'Romain', 'last_name' : 'Loth',
'mail' : '',
'website' : 'http://iscpif.fr',
'picture' : 'romain.jpg',
'role' : 'developer'},
{ 'first_name' : 'Elias', 'last_name' : 'Showk',
'mail' : '',
'website' : 'https://github.com/elishowk',
'picture' : '', 'role' : 'developer'},
]
_institutions = [
#{ 'name' : 'Mines ParisTech', 'website' : 'http://mines-paristech.fr', 'picture' : 'mines.png', 'funds':''},
{ 'name' : 'Mines ParisTech', 'website' : 'http://mines-paristech.fr', 'picture' : 'mines.png', 'funds':''},
#{ 'name' : 'Institut Pasteur', 'website' : 'http://www.pasteur.fr', 'picture' : 'pasteur.png', 'funds':''},
{ 'name' : 'EHESS', 'website' : 'http://www.ehess.fr', 'picture' : 'ehess.png', 'funds':''},
#{ 'name' : '', 'website' : '', 'picture' : '', 'funds':''},
......@@ -87,6 +89,10 @@ def members():
random.shuffle(_members)
return _members
def membersPast():
random.shuffle(_membersPast)
return _membersPast
def institutions():
random.shuffle(_institutions)
return _institutions
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** HAL Parser ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser
from datetime import datetime
import json
class HalParser(Parser):
def parse(self, filebuf):
'''
parse :: FileBuff -> [Hyperdata]
'''
contents = filebuf.read().decode("UTF-8")
data = json.loads(contents)
filebuf.close()
json_docs = data
hyperdata_list = []
hyperdata_path = { "id" : "isbn_s"
, "title" : "title_s"
, "abstract" : "abstract_s"
, "source" : "journalPublisher_s"
, "url" : "uri_s"
, "authors" : "authFullName_s"
}
uris = set()
for doc in json_docs:
hyperdata = {}
for key, path in hyperdata_path.items():
field = doc.get(path, "NOT FOUND")
if isinstance(field, list):
hyperdata[key] = ", ".join(field)
else:
hyperdata[key] = field
if hyperdata["url"] in uris:
print("Document already parsed")
else:
uris.add(hyperdata["url"])
# hyperdata["authors"] = ", ".join(
# [ p.get("person", {})
# .get("name" , "")
#
# for p in doc.get("hasauthor", [])
# ]
# )
#
maybeDate = doc.get("submittedDate_s", None)
if maybeDate is not None:
date = datetime.strptime(maybeDate, "%Y-%m-%d %H:%M:%S")
else:
date = datetime.now()
hyperdata["publication_date"] = date
hyperdata["publication_year"] = str(date.year)
hyperdata["publication_month"] = str(date.month)
hyperdata["publication_day"] = str(date.day)
hyperdata_list.append(hyperdata)
return hyperdata_list
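# A sketch of one resulting hyperdata entry (values are illustrative):
# { "id": "...", "title": "...", "abstract": "...", "source": "...",
#   "url": "https://hal.archives-ouvertes.fr/hal-...",
#   "authors": "A. Author, B. Author",
#   "publication_date": datetime(...), "publication_year": "2016",
#   "publication_month": "5", "publication_day": "27" }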
......@@ -13,20 +13,21 @@ class ISTexParser(Parser):
hyperdata_list = []
hyperdata_path = {
"id" : "id",
"source" : 'corpusName',
"title" : 'title',
"source" : "corpusName",
"title" : "title",
"genre" : "genre",
"language_iso3" : 'language',
"doi" : 'doi',
"host" : 'host',
"publication_date" : 'publicationDate',
"abstract" : 'abstract',
"language_iso3" : "language",
"doi" : "doi",
"host" : "host",
"publication_date" : "publicationDate",
"abstract" : "abstract",
# "authors" : 'author',
"authorsRAW" : 'author',
"authorsRAW" : "author",
#"keywords" : "keywords"
}
suma = 0
for json_doc in json_docs:
hyperdata = {}
......@@ -103,7 +104,7 @@ class ISTexParser(Parser):
RealDate = RealDate[0]
# print( RealDate ," | length:",len(RealDate))
Decision=""
Decision = True
if len(RealDate)>4:
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y-%b-%d').date()
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** MULTIVAC Parser ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser
from datetime import datetime
import json
class MultivacParser(Parser):
def parse(self, filebuf):
'''
parse :: FileBuff -> [Hyperdata]
'''
contents = filebuf.read().decode("UTF-8")
data = json.loads(contents)
filebuf.close()
json_docs = data
hyperdata_list = []
hyperdata_path = { "id" : "id"
, "title" : "title"
, "abstract" : "abstract"
, "type" : "type"
}
for json_doc in json_docs:
hyperdata = {}
doc = json_doc["_source"]
for key, path in hyperdata_path.items():
hyperdata[key] = doc.get(path, "")
hyperdata["source"] = doc.get("serial" , {})\
.get("journaltitle", "REPEC Database")
try:
hyperdata["url"] = doc.get("file", {})\
.get("url" , "")
except:
pass
hyperdata["authors"] = ", ".join(
[ p.get("person", {})
.get("name" , "")
for p in doc.get("hasauthor", [])
]
)
year = doc.get("serial" , {})\
.get("issuedate", None)
if year == "Invalide date":
year = doc.get("issuedate" , None)
if year is None:
date = datetime.now()
else:
try:
date = datetime.strptime(year, '%Y')
except (TypeError, ValueError):
print("FIX DATE MULTIVAC REPEC %s" % year)
date = datetime.now()
hyperdata["publication_date"] = date
hyperdata["publication_year"] = str(date.year)
hyperdata["publication_month"] = str(date.month)
hyperdata["publication_day"] = str(date.day)
hyperdata_list.append(hyperdata)
return hyperdata_list
......@@ -78,7 +78,7 @@ class PubmedParser(Parser):
if "publication_month" in hyperdata: PubmedDate+=" "+hyperdata["publication_month"]
if "publication_day" in hyperdata: PubmedDate+=" "+hyperdata["publication_day"]
Decision=""
Decision=True
if len(RealDate)>4:
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
......
......@@ -109,7 +109,7 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
.group_by("counted_form")
)
#print(str(occs_q))
#print(str(occs_q.all()))
occ_sums = occs_q.all()
# example result = [(1970, 1.0), (2024, 2.0), (259, 2.0), (302, 1.0), ... ]
# ^^^^ ^^^
......@@ -177,6 +177,7 @@ def compute_ti_ranking(corpus,
- overwrite_id: optional id of a pre-existing XXXX node for this corpus
(the Node and its previous Node NodeNgram rows will be replaced)
"""
print("compute_ti_ranking")
# validate string params
if count_scope not in ["local","global"]:
raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
......@@ -189,7 +190,7 @@ def compute_ti_ranking(corpus,
if type(corpus) == int:
corpus_id = corpus
corpus = cache.Node[corpus_id]
elif type(corpus) == str and match(r'\d+$', corpus):
elif type(corpus) == str and match(r'^\d+$', corpus):
corpus_id = int(corpus)
corpus = cache.Node[corpus_id]
else:
......@@ -329,7 +330,7 @@ def compute_ti_ranking(corpus,
# result
print("%s : Starting Query tf_nd_query" % t())
print(str(tf_nd_query))
#print(str(tf_nd_query.all()))
tf_nd = tf_nd_query.all()
print("%s : End Query tf_nd_quer" % t())
......@@ -371,7 +372,7 @@ def compute_ti_ranking(corpus,
# TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
# TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
# TODO 4 requalify this here as a NodeNgram
# then TODO 5 use WeightedList.save() !
# TODO 5 use WeightedList.save()
# reflect that in NodeNodeNgrams
bulk_insert(
......@@ -398,7 +399,8 @@ def compute_tfidf_local(corpus,
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
print("Compute TFIDF local")
# All docs of this corpus
docids_subquery = (session
.query(Node.id)
......
......@@ -3,9 +3,9 @@ COOCS
(this is the full SQL version, should be more reliable on outerjoin)
"""
from gargantext import settings
from sqlalchemy import create_engine
from sqlalchemy import exc
from gargantext.util.lists import WeightedMatrix
# from gargantext.util.db import session, aliased, func
from gargantext.util.db import get_engine
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD, NODETYPES
from gargantext.constants import INDEXED_HYPERDATA
......@@ -64,12 +64,7 @@ def compute_coocs( corpus,
"""
# 1) prepare direct connection to the DB
url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}'.format(
**settings.DATABASES['default']
)
engine = create_engine( url )
connection = engine.connect()
connection = get_engine().connect()
# string vars for our SQL query
# setting work memory high to improve cache perf.
......@@ -223,10 +218,19 @@ def compute_coocs( corpus,
# 6) EXECUTE QUERY
# ----------------
# debug
print(final_sql)
#print(final_sql)
# executing the SQL statement
results = connection.execute(final_sql)
try:
# the database may have been restarted since the connection was pooled
results = connection.execute(final_sql)
connection.close()
except exc.DBAPIError as e:
# the Connection is invalidated; without results there is nothing
# to store, so report and propagate
if e.connection_invalidated:
print("Connection was invalidated for ngram_coocs")
else:
print(e)
raise
# => storage in our matrix structure
matrix = WeightedMatrix(results)
......
......@@ -47,7 +47,8 @@ def about(request):
context = {
'user': request.user,
'date': datetime.datetime.now(),
'team': credits.members(),
'team' : credits.members(),
'teamPast': credits.membersPast(),
'institutions': credits.institutions(),
'labos': credits.labs(),
'grants': credits.grants(),
......
......@@ -8,6 +8,7 @@ from graph.cooccurrences import countCooccurrences
from graph.distances import clusterByDistances
from graph.bridgeness import filterByBridgeness
from graph.mail_notification import notify_owner
from graph.growth import compute_growth
from gargantext.util.scheduling import scheduled
from gargantext.constants import graph_constraints
......@@ -64,7 +65,15 @@ def compute_graph( corpus_id=None , cooc_id=None
print("GRAPH #%d ... Filtering by bridgeness %d." % (cooc_id, bridgeness))
data = filterByBridgeness(G,partition,ids,weight,bridgeness,"node_link",field1,field2)
if start is not None and end is not None:
growth= dict()
for (ng_id, score) in compute_growth(corpus_id, groupList_id, mapList_id, start, end):
growth[ng_id] = float(score) + 100 # for the normalization, should not be negative
for node in data['nodes']:
node['attributes']['growth'] = growth[node['id']]
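# OCC_HIST scores are symmetric relative growths in [-100, 100], so the
# +100 shift maps them into [0, 200]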
print("GRAPH #%d ... Saving Graph in hyperdata as json." % cooc_id)
node = session.query(Node).filter(Node.id == cooc_id).first()
......@@ -187,7 +196,7 @@ def get_graph( request=None , corpus=None
)
.filter( Start.key == 'publication_date')
.filter( Start.value_utc >= date_start_utc)
)
)
# Filter corpus by date if any end date
......@@ -203,8 +212,7 @@ def get_graph( request=None , corpus=None
)
.filter( End.key == 'publication_date')
.filter( End.value_utc <= date_end_utc )
)
)
# Finally test if the size of the corpora is big enough
# --------------------------------
......@@ -221,10 +229,11 @@ def get_graph( request=None , corpus=None
#, limit=size
)
return {"state" : "saveOnly",
"target_id" : cooc_id,
"target_name": cooc_name,
"target_date": cooc_date}
return { "state" : "saveOnly"
, "target_id" : cooc_id
, "target_name": cooc_name
, "target_date": cooc_date
}
elif corpus_size > graph_constraints['corpusMax']:
# Then compute cooc asynchronously with celery
......@@ -262,5 +271,5 @@ def get_graph( request=None , corpus=None
if len(data) == 0:
print("GRAPH # ... GET_GRAPH: 0 coocs in matrix")
data = {'nodes':[], 'links':[]} # empty data
return data
"""
Computes ngram growth on periods
"""
from gargantext.models import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db_cache import cache
from gargantext.util.db import session, bulk_insert, aliased, \
func, get_engine # = sqlalchemy.func like sum() or count()
from datetime import datetime
def timeframes(start, end):
"""
timeframes :: String -> String -> (UTCTime, UTCTime, UTCTime)
"""
start = datetime.strptime (str(start), "%Y-%m-%d")
end = datetime.strptime (str(end), "%Y-%m-%d")
date_0 = start - (end - start)
date_1 = start
date_2 = end
return (date_0, date_1, date_2)
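# e.g. timeframes("2000-01-01", "2000-01-11")
#  ==> (datetime(1999, 12, 22, 0, 0), datetime(2000, 1, 1, 0, 0), datetime(2000, 1, 11, 0, 0))
# i.e. the first period mirrors the requested [start, end] window backwards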
def compute_growth(corpus_id, groupList_id, mapList_id, start, end):
"""
compute_growth :: Int -> Int -> Int -> UTCTime -> UTCTime
-> [(Int, Numeric)]
this function uses SQL function in
/srv/gargantext/install/gargamelle/sqlFunctions.sql
First compute occurrences of ngrams in mapList (with groups) on the first
period, then on the second and finally returns growth.
Directly computed with Postgres Database (C) for optimization.
"""
connection = get_engine()
(date_0, date_1, date_2) = timeframes(start, end)
query = """SELECT * FROM OCC_HIST( {corpus_id}
, {groupList_id}
, {mapList_id}
, '{date_0}'
, '{date_1}'
, '{date_2}'
)
""".format( corpus_id = corpus_id
, groupList_id = groupList_id
, mapList_id = mapList_id
, date_0 = date_0
, date_1 = date_1
, date_2 = date_2
)
return(connection.execute(query))
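# The OCC_HIST SQL function (defined in install/gargamelle/sqlFunctions.sql,
# shown further below) returns one symmetric relative growth per map-list
# ngram; a Python sketch of that score, not the executed path:
#
#   def growth_score(score1, score2):
#       # a NULL on either side collapses the SQL expression to 0
#       if score1 is None or score2 is None:
#           return 0.0
#       # the `or 1` guards the zero-sum edge case, as COALESCE does in SQL
#       return round(100 * (score2 - score1) / ((score2 + score1) or 1), 2)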
......@@ -19,6 +19,8 @@ def compress_graph(graphdata):
for node in graphdata['nodes']:
node['lb'] = node['label']
del node['label']
#node['attributes']['growth'] = 0.8
node['at'] = node['attributes']
del node['attributes']
......
......@@ -5,13 +5,10 @@ apt-get install -y \
apt-utils ca-certificates locales \
sudo aptitude gcc g++ wget git vim \
build-essential make \
postgresql-9.5 postgresql-client-9.5 postgresql-contrib-9.5 \
postgresql-server-dev-9.5 libpq-dev libxml2 \
postgresql-9.5 postgresql-client-9.5 postgresql-contrib-9.5 \
nginx rabbitmq-server
# WARNING: uwsgi is not on stretch any more (get it from unstable)
# uwsgi uwsgi-core uwsgi-plugin-python3
postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6 \
postgresql-server-dev-9.6 libpq-dev libxml2 \
postgresql-9.6 postgresql-client-9.6 postgresql-contrib-9.6 \
nginx rabbitmq-server uwsgi uwsgi-core uwsgi-plugin-python3
### Configure timezone and locale
......@@ -32,15 +29,15 @@ update-locale LC_ALL=fr_FR.UTF-8
### Install main dependencies and python packages based on Debian distrib
echo "############# PYTHON DEPENDENCIES ###############"
apt-get update && apt-get install -y \
libxml2-dev xml-core libgfortran-5-dev \
libxml2-dev xml-core libgfortran-6-dev \
libpq-dev \
python3.5 \
python3-dev \
python3-six python3-numpy python3-setuptools \
python3-numexpr \
python3-pip \
libxml2-dev libxslt-dev
#libxslt1-dev zlib1g-dev
libxml2-dev libxslt-dev zlib1g-dev
#libxslt1-dev
# UPDATE AND CLEAN
apt-get update && apt-get autoclean
......@@ -70,7 +67,7 @@ update-locale LC_ALL=fr_FR.UTF-8
## POSTGRESQL DATA (as ROOT)
#######################################################################
sed -iP "s%^data_directory.*%data_directory = \'\/srv\/gargandata\'%" /etc/postgresql/9.5/main/postgresql.conf
echo "host all all 0.0.0.0/0 md5" >> /etc/postgresql/9.5/main/pg_hba.conf
echo "listen_addresses='*'" >> /etc/postgresql/9.5/main/postgresql.conf
sed -iP "s%^data_directory.*%data_directory = \'\/srv\/gargandata\'%" /etc/postgresql/9.6/main/postgresql.conf
echo "host all all 0.0.0.0/0 md5" >> /etc/postgresql/9.6/main/pg_hba.conf
echo "listen_addresses='*'" >> /etc/postgresql/9.6/main/postgresql.conf
-- CNRS Copyrights 2017
-- See Gargantext Licence for details
-- Maintainers: team@gargantext.org
-- USAGE
-- psql gargandb < occ_growth.sql
-- OCC_HIST :: Corpus.id -> GroupList.id -> MapList.id -> Start -> EndFirst -> EndLast
-- EXAMPLE USAGE
-- SELECT * FROM OCC_HIST(182856, 183859, 183866, '1800-03-15 17:00:00+01', '2000-03-15 17:00:00+01', '2017-03-15 17:00:00+01')
-- OCC_HIST_PART :: Corpus.id -> GroupList.id -> Start -> End
DROP FUNCTION OCC_HIST_PART(integer, integer, timestamp without time zone, timestamp without time zone);
-- DROP for tests
CREATE OR REPLACE FUNCTION OCC_HIST_PART(int, int, timestamp, timestamp) RETURNS TABLE (ng_id int, score float8)
AS $$
-- EXPLAIN ANALYZE
SELECT
COALESCE(gr.ngram1_id, ng1.ngram_id) as ng_id,
SUM(ng1.weight) as score
from nodes n
-- BEFORE
INNER JOIN nodes as n1 ON n1.id = n.id
INNER JOIN nodes_ngrams ng1 ON ng1.node_id = n1.id
-- Limit with timestamps: ]start, end]
INNER JOIN nodes_hyperdata nh1 ON nh1.node_id = n1.id
AND nh1.value_utc > $3
AND nh1.value_utc <= $4
-- Group List
LEFT JOIN nodes_ngrams_ngrams gr ON ng1.ngram_id = gr.ngram2_id
AND gr.node_id = $2
WHERE
n.typename = 4
AND n.parent_id = $1
GROUP BY 1
$$
LANGUAGE SQL;
DROP FUNCTION OCC_HIST(integer, integer, integer, timestamp without time zone, timestamp without time zone, timestamp without time zone);
-- OCC_HIST :: Corpus.id -> GroupList.id -> MapList.id -> Start -> EndFirst -> EndLast
CREATE OR REPLACE FUNCTION OCC_HIST(int, int, int, timestamp, timestamp, timestamp) RETURNS TABLE (ng_id int, score numeric)
AS $$
WITH OCC1 as (SELECT * from OCC_HIST_PART($1, $2, $4, $5))
, OCC2 as (SELECT * from OCC_HIST_PART($1, $2, $5, $6))
, GROWTH as (SELECT ml.ngram_id as ngram_id
, COALESCE(OCC1.score, null) as score1
, COALESCE(OCC2.score, null) as score2
FROM nodes_ngrams ml
LEFT JOIN OCC1 ON OCC1.ng_id = ml.ngram_id
LEFT JOIN OCC2 ON OCC2.ng_id = ml.ngram_id
WHERE ml.node_id = $3
ORDER by score2 DESC)
SELECT ngram_id, COALESCE(ROUND(CAST((100 * (score2 - score1) / COALESCE((score2 + score1), 1)) as numeric), 2), 0) from GROWTH
$$
LANGUAGE SQL;
-- BEHAVIORAL TEST (should be equal to occ in terms table)
-- WITH OCC as (SELECT * from OCC_HIST(182856, 183859, '1800-03-15 17:00:00+01', '2300-03-15 17:00:00+01'))
-- SELECT ng_id, score from OCC
-- INNER JOIN nodes_ngrams ml on ml.ngram_id = ng_id
-- AND ml.node_id = 183866
-- ORDER BY score DESC;
......@@ -12,12 +12,12 @@ echo "::::: POSTGRESQL :::::"
su postgres -c 'pg_dropcluster 9.4 main --stop'
#done in docker but redoing it
rm -rf /srv/gargandata && mkdir /srv/gargandata && chown postgres:postgres /srv/gargandata
su postgres -c '/usr/lib/postgresql/9.5/bin/initdb -D /srv/gargandata/'
su postgres -c '/usr/lib/postgresql/9.5/bin/pg_ctl -D /srv/gargandata/ -l /srv/gargandata/journal_applicatif start'
su postgres -c '/usr/lib/postgresql/9.6/bin/initdb -D /srv/gargandata/'
su postgres -c '/usr/lib/postgresql/9.6/bin/pg_ctl -D /srv/gargandata/ -l /srv/gargandata/journal_applicatif start'
su postgres -c 'pg_createcluster -D /srv/gargandata 9.5 main '
su postgres -c 'pg_ctlcluster -D /srv/gargandata 9.5 main start '
su postgres -c 'pg_ctlcluster 9.5 main start'
su postgres -c 'pg_createcluster -D /srv/gargandata 9.6 main '
su postgres -c 'pg_ctlcluster -D /srv/gargandata 9.6 main start '
su postgres -c 'pg_ctlcluster 9.6 main start'
service postgresql start
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** HAL Crawler *****
# ****************************
# LICENCE: GARGANTEXT.org Licence
RESOURCE_TYPE_HAL = 11
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect \
, HttpResponseForbidden
from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from traceback import print_tb
def query( request):
'''get GlobalResults()'''
if request.method == "POST":
query = request.POST["query"]
source = get_resource(RESOURCE_TYPE_HAL)
if source["crawler"] is not None:
crawlerbot = load_crawler(source)()
#old raw way to get results_nb
results = crawlerbot.scan_results(query)
#ids = crawlerbot.get_ids(query)
print(results)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
'''save'''
if request.method == "POST":
query = request.POST.get("query")
try:
N = int(request.POST.get("N"))
except (TypeError, ValueError):
N = 0
print(query, N)
#for next time
#ids = request.POST["ids"]
source = get_resource(RESOURCE_TYPE_HAL)
if N == 0:
raise Http404()
if N > QUERY_SIZE_N_MAX:
N = QUERY_SIZE_N_MAX
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = session.query( Node ).filter(Node.id == project_id).first()
if project is None:
raise Http404()
user = cache.User[request.user.id]
if not user.owns(project):
return HttpResponseForbidden()
# corpus node instantiation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
}
)
#download_file
crawler_bot = load_crawler(source)()
#for now no way to force downloading X records
#the long running command
filename = crawler_bot.download(query)
corpus.add_resource(
type = source["type"]
#, name = source["name"]
, path = crawler_bot.path
)
session.add(corpus)
session.commit()
#corpus_id = corpus.id
try:
scheduled(parse_extract_indexhyperdata)(corpus.id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after an interrupted transaction
session.rollback()
# --------------------------------------------
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user,
'project': project,
},
)
# non-POST requests are not supported
raise Http404()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** MULTIVAC Crawler *****
# ****************************
# LICENCE: GARGANTEXT.org Licence
RESOURCE_TYPE_MULTIVAC = 10
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from traceback import print_tb
def query( request):
'''get GlobalResults()'''
if request.method == "POST":
query = request.POST["query"]
source = get_resource(RESOURCE_TYPE_MULTIVAC)
if source["crawler"] is not None:
crawlerbot = load_crawler(source)()
#old raw way to get results_nb
results = crawlerbot.scan_results(query)
#ids = crawlerbot.get_ids(query)
print(results)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
'''save'''
if request.method == "POST":
query = request.POST.get("query")
try:
N = int(request.POST.get("N"))
except (TypeError, ValueError):
N = 0
print(query, N)
#for next time
#ids = request.POST["ids"]
source = get_resource(RESOURCE_TYPE_MULTIVAC)
if N == 0:
raise Http404()
if N > QUERY_SIZE_N_MAX:
N = QUERY_SIZE_N_MAX
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = session.query( Node ).filter(Node.id == project_id).first()
if project is None:
raise Http404()
user = cache.User[request.user.id]
if not user.owns(project):
return HttpResponseForbidden()
# corpus node instantiation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
, "language_id" : "en"
}
)
#download_file
crawler_bot = load_crawler(source)()
#for now no way to force downloading X records
#the long running command
filename = crawler_bot.download(query)
corpus.add_resource(
type = source["type"]
#, name = source["name"]
, path = crawler_bot.path
)
session.add(corpus)
session.commit()
#corpus_id = corpus.id
try:
scheduled(parse_extract_indexhyperdata)(corpus.id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after an interrupted transaction
session.rollback()
# --------------------------------------------
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user,
'project': project,
},
)
# non-POST requests are not supported
raise Http404()
......@@ -18,24 +18,31 @@
from django.conf.urls import url
import moissonneurs.pubmed as pubmed
import moissonneurs.istex as istex
import moissonneurs.cern as cern
import moissonneurs.pubmed as pubmed
import moissonneurs.istex as istex
import moissonneurs.cern as cern
import moissonneurs.multivac as multivac
import moissonneurs.hal as hal
# TODO
#import moissonneurs.hal as hal
#import moissonneurs.revuesOrg as revuesOrg
# TODO ?
# REST API for the moissonneurs
# TODO : ISIDORE
# /!\ urls patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
, url(r'^pubmed/save/(\d+)' , pubmed.save )
, url(r'^istex/query$' , istex.query )
, url(r'^istex/save/(\d+)' , istex.save )
, url(r'^cern/query$' , cern.query )
, url(r'^cern/save/(\d+)' , cern.save )
urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
, url(r'^pubmed/save/(\d+)' , pubmed.save )
, url(r'^istex/query$' , istex.query )
, url(r'^istex/save/(\d+)' , istex.save )
, url(r'^cern/query$' , cern.query )
, url(r'^cern/save/(\d+)' , cern.save )
, url(r'^multivac/query$' , multivac.query )
, url(r'^multivac/save/(\d+)' , multivac.save )
, url(r'^hal/query$' , hal.query )
, url(r'^hal/save/(\d+)' , hal.save )
#, url(r'^isidore/query$' , isidore.query )
#, url(r'^isidore/save/(\d+)' , isidore.save )
]
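# A sketch of the resulting routes (host and project id are hypothetical):
#   POST /moissonneurs/hal/query    -> JSON {"results_nb": <int>}
#   POST /moissonneurs/hal/save/42  -> renders pages/projects/wait.html
# and likewise for pubmed, istex, cern and multivac.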
......@@ -183,9 +183,55 @@
</div>
</div>
</div>
{% endif %}
{% if teamPast %}
<div class="panel panel-default">
<div class="panel-heading">
<h2 class="panel-title">
<a data-toggle="collapse" data-parent="#accordion" href="#collapseTeamPast">
<center>
<h2>
<span class="glyphicon glyphicon-question-sign" aria-hidden="true"></span>
Former Developers
<span class="glyphicon glyphicon-question-sign" aria-hidden="true"></span>
</h2>
</center>
</a>
</h2>
</div>
<div id="collapseTeamPast" class="panel-collapse collapse" role="tabpanel">
<div class="panel-body">
<div class="container">
<div class="row">
<div class="thumbnails">
{% for member in teamPast %}
<div class="col-md-5 ">
<div class="thumbnail">
<div class="caption">
<center>
<h3>{{ member.first_name }} {{member.last_name }}</h3>
{% if member.role %}
<p class="description">{{ member.role }}</p>
{% endif %}
</center>
</div>
</div>
</div>
{% endfor %}
</div>
</div>
</div>
</div>
</div>
</div>
{% endif %}
</div>
</div>
<div class="panel panel-default">
<div class="panel-heading">
......
......@@ -367,7 +367,7 @@
<p>
Gargantext
<span class="glyphicon glyphicon-registration-mark" aria-hidden="true"></span>
, version 3.0.6.6,
, version 3.0.6.8,
<a href="http://www.cnrs.fr" target="blank" title="Institution that enables this project.">
Copyrights
<span class="glyphicon glyphicon-copyright-mark" aria-hidden="true"></span>
......
......@@ -86,12 +86,12 @@
<button type="button" class="close" data-dismiss="modal" aria-label="Close">
<span aria-hidden="true">&times;</span>
</button>
<h2 class="modal-title"><h2><span class="glyphicon glyphicon-info-sign" aria-hidden="true"></span> Uploading corpus...</h2>
<h2 class="modal-title"><h2><span class="glyphicon glyphicon-info-sign" aria-hidden="true"></span>Building corpus...</h2>
</div>
<div class="modal-body">
<h5>
Your file has been uploaded !
Gargantext need some time to eat it.
Gargantext is gathering your texts
and needs some time to eat them.
Duration depends on the size of the dish.
</h5>
</div>
......
......@@ -209,9 +209,11 @@
function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
selected = selected.toLowerCase()
var is_pubmed = (selected.indexOf('pubmed') != -1);
var is_istex = (selected.indexOf('istex') != -1);
if (is_pubmed || is_istex) {
var is_pubmed = (selected.indexOf('pubmed') != -1);
var is_istex = (selected.indexOf('istex' ) != -1);
var is_repec = (selected.indexOf('repec' ) != -1);
if (is_pubmed || is_istex || is_repec) {
// if(selected=="pubmed") {
console.log("show the button for: " + selected)
$("#pubmedcrawl").css("visibility", "visible");
......
......@@ -199,12 +199,12 @@
<button type="button" class="close" data-dismiss="modal" aria-label="Close">
<span aria-hidden="true">&times;</span>
</button>
<h2 class="modal-title"><h2><span class="glyphicon glyphicon-info-sign" aria-hidden="true"></span> Uploading corpus...</h2>
<h2 class="modal-title"><h2><span class="glyphicon glyphicon-info-sign" aria-hidden="true"></span>Building the corpus...</h2>
</div>
<div class="modal-body">
<p>
Your file has been uploaded !
Gargantext need some time to eat it.
Gargantext is gathering your texts
and needs some time to eat them.
Duration depends on the size of the dish.
</p>
</div>
......