Commit 3aae3103 authored by delanoe

[FACTO] Factorization of scraper tools/methods. (TODO: factorize again.)

parent 4a6171e7
@@ -208,3 +208,9 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
# about batch processing...
BATCH_PARSING_SIZE = 256
BATCH_NGRAMSEXTRACTION_SIZE = 1024
+# Scrapers config
+QUERY_SIZE_N_MAX = 1000
+QUERY_SIZE_N_DEFAULT = 1000
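These two constants cap and default the sample size a scraper view will accept; the views below import QUERY_SIZE_N_MAX from gargantext.constants. A minimal sketch of the intended use, assuming a hypothetical clamp helper (only the two constant names come from this commit):

# Sketch only: clamp_query_size is a hypothetical helper, not part of this
# commit; the two constants are the ones added above.
from gargantext.constants import QUERY_SIZE_N_MAX, QUERY_SIZE_N_DEFAULT

def clamp_query_size(n=None):
    """Return a safe sample size for a scraper query."""
    if n is None:
        return QUERY_SIZE_N_DEFAULT
    n = int(n)
    if n > QUERY_SIZE_N_MAX:
        # mirrors the ValueError raised in the save() views below
        raise ValueError("Invalid sample size N = %i (max = %i)" % (n, QUERY_SIZE_N_MAX))
    return n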
@@ -8,9 +8,9 @@ Views are shared between these modules:
- `graph explorer`, to explore graphs
"""
from django.conf.urls import include, url
from django.contrib import admin
import gargantext.views.api.urls
import gargantext.views.generated.urls
@@ -18,32 +18,34 @@ import gargantext.views.pages.urls
# Module Annotation
## tempo: unchanged doc-annotations --
from annotations import urls as annotations_urls
from annotations.views import main as annotations_main_view
# Module "Graph Explorer"
#from graphExplorer import urls as graphExplorer_urls
from graphExplorer.rest import Graph
from graphExplorer.views import explorer
-from scrapers import urls as scrapers_urls
+# Module Scrapers
+from scrapers import urls as scrapers_urls
-urlpatterns = [ url(r'^admin/', admin.site.urls)
-              , url(r'^generated/', include(gargantext.views.generated.urls))
-              , url(r'^api/', include(gargantext.views.api.urls))
-              , url(r'^', include(gargantext.views.pages.urls))
+urlpatterns = [ url(r'^admin/' , admin.site.urls )
+              , url(r'^generated/' , include( gargantext.views.generated.urls ))
+              , url(r'^api/' , include( gargantext.views.api.urls ) )
+              , url(r'^' , include( gargantext.views.pages.urls ) )
              # Module Annotation
              # tempo: unchanged doc-annotations routes --
-              , url(r'^annotations/', include(annotations_urls))
+              , url(r'^annotations/', include( annotations_urls ) )
              , url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view)
              # Module "Graph Explorer"
-              , url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer)
-              , url(r'^projects/(\d+)/corpora/(\d+)/graph$', Graph.as_view())
+              , url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer )
+              , url(r'^projects/(\d+)/corpora/(\d+)/graph$' , Graph.as_view())
              # to be removed:
              , url(r'^projects/(\d+)/corpora/(\d+)/node_link.json$', Graph.as_view())
-              #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
+              #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer_urls))
-              , url(r'^scrapers/', include(scrapers_urls))
+              # Scrapers module
+              , url(r'^scrapers/' , include( scrapers_urls ) )
              ]
def suggest(keywords):
    return ['Suggestion #1', 'Suggestion #2', 'Suggestion #3', 'Suggestion #4', 'Suggestion #5']

def count(keywords):
    return 42

def query_save(keywords):
    return 'path/to/query.xml'
# from datetime import datetime
from time import sleep
import datetime
import threading
#from gargantext.settings import MEDIA_ROOT, BASE_DIR
from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import RESOURCETYPES, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.util.tools import ensure_dir
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from scrapers.util import Scraper
def query( request ):
    """
    ISTEX: simply the total number of hits for a query
    (not reused in save)
    """
    print(request.method)
    alist = ["bar","foo"]

    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query)
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N)
        query_string = query.replace(" ","+")
        url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"

        tasks = Scraper()
        try:
            thedata_path = tasks.download( url )
            thedata = open(thedata_path, "rb")
            alist = thedata.read().decode('utf-8')
        except Exception as error:
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)
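A quick way to exercise this view by hand once the ^scrapers/ routes below are mounted, sketched with Django's test client (the query text and N are arbitrary examples):

# Smoke-test sketch (assumes a configured Django test environment).
from django.test import Client

client = Client()
response = client.post('/scrapers/istex/query', {'query': 'open science', 'N': 100})
print(response.status_code)  # the JSON body holds the raw ISTEX hits, or an error string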
def save(request , project_id):
    print("istex save:")
    print(request.method)
    alist = ["bar","foo"]

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session
                .query(Node)
                .filter(Node.id == project_id)
                .filter(Node.typename == 'PROJECT')
              ).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    if request.method == "POST":
        query = "-"
        query_string = "-"
        N = 0

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ","+") # url-encoded q
        if "N" in request.POST:
            N = int(request.POST["N"]) # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l ): ", msg)
                raise ValueError(msg)

        print("Scraping ISTEX: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))

        # corpus node instantiation as a Django model
        corpus = Node(
            name = query,
            user_id = request.user.id,
            parent_id = project_id,
            typename = 'CORPUS',
            hyperdata = { "action"      : "Scraping data"
                        , "language_id" : None
                        }
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        print("NEW CORPUS", corpus_id)

        ensure_dir(request.user)
        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2) # launch a download worker
            t.daemon = True # thread dies when the main thread (the only non-daemon thread) exits
            t.start()

        for url in urlreqs:
            tasks.q.put(url) # put a task in the queue
        tasks.q.join()       # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the downloaded resource to the corpus
                corpus.add_resource( type = 3
                                   , path = filename
                                   )
                dwnldsOK += 1

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        ###########################
        # parse, extract and index the new corpus
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = [query_string, query, N]
    return JsonHttpResponse(data)
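The paging above slices the requested N documents into ISTEX from/size windows of at most 50 via Scraper.chunks(). The same computation as a standalone sketch, with an invented N = 120:

# Standalone paging sketch: reproduces the from/size windows built above.
N, pagesize = 120, 50
windows = [(k0, min(pagesize, N - k0)) for k0 in range(0, N, pagesize)]
print(windows)  # -> [(0, 50), (50, 50), (100, 20)]
urls = ["http://api.istex.fr/document/?q=test&output=*&from=%d&size=%d" % w
        for w in windows]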
# ****************************
# ***** Medline Scraper *****
# ****************************
from scrapers.MedlineFetcher import MedlineFetcher
# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
# between 9 pm and 5 am Eastern Time weekdays
# from datetime import datetime
@@ -13,7 +17,7 @@ import threading
from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
-from gargantext.constants import RESOURCETYPES
+from gargantext.constants import RESOURCETYPES, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
@@ -21,22 +25,11 @@ from gargantext.util.tools import ensure_dir
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from scrapers.util import Scraper
-# to read the [scrapers] section of gargantext.ini
-#from configparser import ConfigParser
-# --------------------------------------------------------------------
-# importing constants from config file
-#CONF = ConfigParser()
-#with open(path.join(BASE_DIR, 'gargantext.ini')) as inifile:
-#    CONF.read_file(inifile)
-QUERY_SIZE_N_MAX = 1000 # int(CONF['scrappers']['QUERY_SIZE_N_MAX'])
-# QUERY_SIZE_N_DEFAULT = int(CONF['scrappers']['QUERY_SIZE_N_DEFAULT'])
-# --------------------------------------------------------------------
-def getGlobalStats( request ):
+def query( request ):
    """
    Pubmed year by year results
@@ -47,7 +40,7 @@ def getGlobalStats( request ):
    # 'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
    # ... ]
-    (reused as thequeries in doTheQuery)
+    (reused as thequeries in query_save)
    """
    print(request.method)
    alist = []
@@ -63,7 +56,7 @@ def getGlobalStats( request ):
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
instancia = MedlineFetcher()
instancia = Scraper()
# serialFetcher (n_last_years, query, query_size)
alist = instancia.serialFetcher( 5, query , N )
@@ -72,7 +65,7 @@ def getGlobalStats( request ):
    return JsonHttpResponse(data)
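Per the docstring above, serialFetcher prepares one PubMed query per year and returns, for each, a webEnv session key plus count/retmax quotas that the save view then turns into downloads. A sketch of consuming that shape (field names come from the docstring; the values are invented):

# Sketch: consuming serialFetcher output; values are invented.
thequeries = [ {'webEnv': 'NCID_1_...', 'count': 345, 'retmax': 4}
             , {'webEnv': 'NCID_2_...', 'count': 112, 'retmax': 2}
             ]
total = sum(q['retmax'] for q in thequeries)  # documents that will actually be fetched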
-def doTheQuery( request , project_id ) :
+def save( request , project_id ) :
    # implicit global session
    # do we have a valid project id?
    try:
@@ -103,7 +96,7 @@ def doTheQuery( request , project_id ) :
        # here we just realize queries already prepared by getGlobalStats
        # ===> no need to repeat N parameter like in testISTEX <===
-        instancia = MedlineFetcher()
+        instancia = Scraper()
        thequeries = json.loads(queries)
        # fyi the sum of our prepared yearly proportional quotas
@@ -138,7 +131,7 @@ def doTheQuery( request , project_id ) :
        ensure_dir(request.user)
-        tasks = MedlineFetcher()
+        tasks = Scraper()
        for i in range(8):
            t = threading.Thread(target=tasks.worker2) # launch a download worker
@@ -173,144 +166,3 @@ def doTheQuery( request , project_id ) :
    return JsonHttpResponse(data)
-def getGlobalStatsISTEXT(request ):
-    """
-    ISTEX simply the total of hits for a query
-    (not reused in testISTEX)
-    """
-    print(request.method)
-    alist = ["bar","foo"]
-    if request.method == "POST":
-        query = request.POST["query"]
-        N = int(request.POST["N"])
-        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query)
-        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N)
-        query_string = query.replace(" ","+")
-        url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"
-        tasks = MedlineFetcher()
-        try:
-            thedata_path = tasks.download( url )
-            thedata = open(thedata_path, "rb")
-            alist = thedata.read().decode('utf-8')
-        except Exception as error:
-            alist = [str(error)]
-    data = alist
-    return JsonHttpResponse(data)
-
-def testISTEX(request , project_id):
-    print("testISTEX:")
-    print(request.method)
-    alist = ["bar","foo"]
-    # implicit global session
-    # do we have a valid project id?
-    try:
-        project_id = int(project_id)
-    except ValueError:
-        raise Http404()
-    # do we have a valid project?
-    project = (session
-                .query(Node)
-                .filter(Node.id == project_id)
-                .filter(Node.typename == 'PROJECT')
-              ).first()
-    if project is None:
-        raise Http404()
-    # do we have a valid user?
-    user = request.user
-    if not user.is_authenticated():
-        return redirect('/auth/?next=%s' % request.path)
-    if project.user_id != user.id:
-        return HttpResponseForbidden()
-    if request.method == "POST":
-        query = "-"
-        query_string = "-"
-        N = 0
-        if "query" in request.POST:
-            query = request.POST["query"]
-            query_string = query.replace(" ","+") # url encoded q
-        if "N" in request.POST:
-            N = int(request.POST["N"]) # query_size from views_opti
-            if N > QUERY_SIZE_N_MAX:
-                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
-                print("ERROR (scrap: istex d/l ): ",msg)
-                raise ValueError(msg)
-        print("Scrapping Istex: '%s' (%i)" % (query_string , N))
-        urlreqs = []
-        pagesize = 50
-        tasks = MedlineFetcher()
-        chunks = list(tasks.chunks(range(N), pagesize))
-        for k in chunks:
-            if (k[0]+pagesize)>N: pagesize = N-k[0]
-            urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
-        # corpus node instanciation as a Django model
-        corpus = Node(
-            name = query,
-            user_id = request.user.id,
-            parent_id = project_id,
-            typename = 'CORPUS',
-            hyperdata = { "action" : "Scraping data"
-                        , "language_id" : None
-                        }
-        )
-        session.add(corpus)
-        session.commit()
-        corpus_id = corpus.id
-        print("NEW CORPUS", corpus_id)
-        ensure_dir(request.user)
-        tasks = MedlineFetcher()
-        for i in range(8):
-            t = threading.Thread(target=tasks.worker2) #thing to do
-            t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
-            t.start()
-        for url in urlreqs:
-            tasks.q.put( url ) #put a task in th queue
-        tasks.q.join() # wait until everything is finished
-        dwnldsOK = 0
-        for filename in tasks.firstResults:
-            if filename!=False:
-                # add the uploaded resource to the corpus
-                corpus.add_resource( type = 3
-                                   , path = filename
-                                   )
-                dwnldsOK+=1
-        if dwnldsOK == 0 :
-            return JsonHttpResponse(["fail"])
-        ###########################
-        ###########################
-        try:
-            scheduled(parse_extract_indexhyperdata(corpus_id,))
-        except Exception as error:
-            print('WORKFLOW ERROR')
-            print(error)
-        sleep(1)
-        return HttpResponseRedirect('/projects/' + str(project_id))
-    data = [query_string,query,N]
-    return JsonHttpResponse(data)
from django.conf.urls import url
import scrapers.pubmed as pubmed
-#import scrapers.istex as istex
+import scrapers.istex as istex
#import scrapers.cern as cern
#import scrapers.hal as hal
@@ -11,11 +11,21 @@ import scrapers.pubmed as pubmed
# Available databases : Pubmed, IsTex, (next: CERN)
# /!\ urls patterns here are *without* the trailing slash
-urlpatterns = [ url(r'^pubmed/query$' , pubmed.getGlobalStats )
-              , url(r'^pubmed/search/(\d+)' , pubmed.doTheQuery )
+urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
+              , url(r'^pubmed/save/(\d+)' , pubmed.save )
-              , url(r'^istex/query$' , pubmed.getGlobalStatsISTEXT )
-              , url(r'^istex/search/(\d+)' , pubmed.testISTEX )
-              #, url(r'^scraping$' , scraping.Target.as_view() )
+              , url(r'^istex/query$' , istex.query )
+              , url(r'^istex/save/(\d+)' , istex.save )
+              # TODO REST API for the scrapers
+              #, url(r'^rest$' , scraping.Target.as_view() )
              ,
              ]
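Both scrapers now expose the same two-step contract: POST /scrapers/<db>/query to size the sample, then POST /scrapers/<db>/save/<project_id> to download and attach a corpus (the rename is search -> save, and the JS changes at the bottom of this commit follow it). A resolution sketch for the renamed routes, using the Django < 2.0 API that matches this codebase:

# Route-resolution sketch (assumes the ^scrapers/ include from gargantext/urls.py above).
from django.core.urlresolvers import resolve

match = resolve('/scrapers/pubmed/save/42')
print(match.func.__name__)  # -> 'save', i.e. pubmed.save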
-#def count(keywords):
-#    return 42
-#
-#def query_save(keywords):
-#    return 'path/to/query.xml'
-#
# ****************************
# ***** Medline Fetcher *****
# ****************************
# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
# between 9 pm and 5 am Eastern Time weekdays
+from gargantext.util.files import download
import sys
-if sys.version_info >= (3, 0): from urllib.request import urlopen
-else: from urllib import urlopen
import os
import time
# import libxml2
-from lxml import etree
import datetime
from django.core.files import File
import codecs
import threading
from queue import Queue
# import time
-class MedlineFetcher:
+from lxml import etree
+if sys.version_info >= (3, 0):
+    from urllib.request import urlopen
+else:
+    from urllib import urlopen
+class Scraper :
    def __init__(self):
        self.queue_size = 8
......
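The views above only rely on a small contract from this class: a Queue q, a worker2() loop target, a download() method, a firstResults list, plus a chunks() helper. The class body is collapsed in this diff, so here is a minimal sketch of that producer/consumer shape (attribute names mirror the call sites; the bodies are assumptions, not the source):

# Minimal sketch of the queue/worker contract used by the views; the real
# Scraper body is collapsed above, so this is an assumption, not the source.
import threading
from queue import Queue

class ScraperSketch:
    def __init__(self):
        self.queue_size = 8
        self.q = Queue()
        self.firstResults = []

    def worker2(self):
        while True:
            url = self.q.get()
            try:
                path = self.download(url)  # a file path, or False on failure
            except Exception:
                path = False
            self.firstResults.append(path)
            self.q.task_done()

    def download(self, url):
        # placeholder: the real method fetches `url` and writes it to disk
        raise NotImplementedError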
@@ -260,7 +260,7 @@
    $.ajax({
        // contentType: "application/json",
-        url: window.location.origin+"/scrapers/pubmed/search/"+projectid,
+        url: window.location.origin+"/scrapers/pubmed/save/"+projectid,
        data: pubmedifiedQuery,
        type: 'POST',
        beforeSend: function(xhr) {
@@ -504,7 +504,7 @@
    $.ajax({
        // contentType: "application/json",
-        url: window.location.origin+"/scrapers/istex/search/"+projectid,
+        url: window.location.origin+"/scrapers/istex/save/"+projectid,
        data: postQuery,
        type: 'POST',
        beforeSend: function(xhr) {
......