Commit 3aae3103 authored by delanoe

[FACTO] Scrapers tools/methods factorization. (TODO: FACTO again).

parent 4a6171e7
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -208,3 +208,9 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
 # about batch processing...
 BATCH_PARSING_SIZE = 256
 BATCH_NGRAMSEXTRACTION_SIZE = 1024
+
+# Scrapers config
+QUERY_SIZE_N_MAX = 1000
+QUERY_SIZE_N_DEFAULT = 1000
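
These constants replace the hard-coded QUERY_SIZE_N_MAX = 1000 that scrapers/pubmed.py used to define locally (removed further down); views now import the cap from gargantext.constants and validate the requested sample size against it. A minimal sketch of the consuming side, mirroring the check in scrapers/istex.py below (the helper name is hypothetical, not part of the commit):

    # sketch only: mirrors the N-vs-QUERY_SIZE_N_MAX check used by the scraper views
    from gargantext.constants import QUERY_SIZE_N_MAX

    def check_sample_size(N):
        # hypothetical helper, not part of the commit
        if N > QUERY_SIZE_N_MAX:
            raise ValueError("Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX))
        return N
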
--- a/gargantext/urls.py
+++ b/gargantext/urls.py
@@ -8,9 +8,9 @@ Views are shared between these modules:
 - `graph explorer`, to explore graphs
 """
 from django.conf.urls import include, url
 from django.contrib import admin

 import gargantext.views.api.urls
 import gargantext.views.generated.urls
@@ -18,32 +18,34 @@ import gargantext.views.pages.urls
 # Module Annotation
 ## tempo: unchanged doc-annotations --
 from annotations import urls as annotations_urls
 from annotations.views import main as annotations_main_view

 # Module "Graph Explorer"
 #from graphExplorer import urls as graphExplorer_urls
 from graphExplorer.rest import Graph
 from graphExplorer.views import explorer

+# Module Scrapers
 from scrapers import urls as scrapers_urls

-urlpatterns = [ url(r'^admin/', admin.site.urls)
-              , url(r'^generated/', include(gargantext.views.generated.urls))
-              , url(r'^api/', include(gargantext.views.api.urls))
-              , url(r'^', include(gargantext.views.pages.urls))
+urlpatterns = [ url(r'^admin/' , admin.site.urls )
+              , url(r'^generated/' , include( gargantext.views.generated.urls ))
+              , url(r'^api/' , include( gargantext.views.api.urls ) )
+              , url(r'^' , include( gargantext.views.pages.urls ) )

 # Module Annotation
 ## tempo: unchanged doc-annotations routes --
-              , url(r'^annotations/', include(annotations_urls))
+              , url(r'^annotations/', include( annotations_urls ) )
               , url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view)

 # Module "Graph Explorer"
-              , url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer)
-              , url(r'^projects/(\d+)/corpora/(\d+)/graph$', Graph.as_view())
+              , url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer )
+              , url(r'^projects/(\d+)/corpora/(\d+)/graph$' , Graph.as_view())
 # to be removed:
               , url(r'^projects/(\d+)/corpora/(\d+)/node_link.json$', Graph.as_view())
-              #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
+              #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer_urls))

-              , url(r'^scrapers/', include(scrapers_urls))
+# Scrapers module
+              , url(r'^scrapers/' , include( scrapers_urls ) )
 ]
--- /dev/null
+++ b/scrapers/istex.py

# Placeholder stubs (dummy return values), not yet wired to ISTEX:
def suggest(keywords):
    return ['Suggestion #1', 'Suggestion #2', 'Suggestion #3', 'Suggestion #4', 'Suggestion #5']

def count(keywords):
    return 42

def query_save(keywords):
    return 'path/to/query.xml'
# from datetime import datetime
from time import sleep
import datetime
import threading
#from gargantext.settings import MEDIA_ROOT, BASE_DIR
from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import RESOURCETYPES, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.util.tools import ensure_dir
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from scrapers.util import Scraper
def query( request ):
    """
    ISTEX: simply returns the total of hits for a query
    (not reused in the save view below)
    """
    print(request.method)
    alist = ["bar", "foo"]

    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)

        query_string = query.replace(" ", "+")
        url = "http://api.istex.fr/document/?q=" + query_string \
            + "&output=id,title,abstract,pubdate,corpusName,authors,language"

        tasks = Scraper()
        try:
            thedata_path = tasks.download(url)
            # read the downloaded result and close the file handle
            with open(thedata_path, "rb") as thedata:
                alist = thedata.read().decode('utf-8')
        except Exception as error:
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)
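
For orientation, a hedged sketch of exercising this view once it is routed (the /scrapers/istex/query path comes from scrapers/urls.py below; the host, port and the requests dependency are assumptions, not part of the commit):

    import requests  # third-party HTTP client, illustration only

    # assumes a local dev server (hypothetical host/port)
    r = requests.post("http://127.0.0.1:8000/scrapers/istex/query",
                      data={"query": "complex networks", "N": 100})
    print(r.json())  # ISTEX hit payload, or [error string] on failure
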
def save(request, project_id):
    print("ISTEX save:")
    print(request.method)
    alist = ["bar", "foo"]

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session
                .query(Node)
                .filter(Node.id == project_id)
                .filter(Node.typename == 'PROJECT')
              ).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    # defaults, so the non-POST fallthrough below stays well-defined
    query        = "-"
    query_string = "-"
    N            = 0

    if request.method == "POST":
        if "query" in request.POST:
            query        = request.POST["query"]
            query_string = query.replace(" ", "+")  # url-encoded q

        if "N" in request.POST:
            N = int(request.POST["N"])              # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l ):", msg)
                raise ValueError(msg)

        print("Scraping Istex: '%s' (%i)" % (query_string, N))

        urlreqs  = []
        pagesize = 50
        tasks    = Scraper()
        chunks   = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                           + "&output=*&" + "from=" + str(k[0]) + "&size=" + str(pagesize))

        # corpus node instantiation as a Django model
        corpus = Node(
            name      = query,
            user_id   = request.user.id,
            parent_id = project_id,
            typename  = 'CORPUS',
            hyperdata = { "action"      : "Scraping data"
                        , "language_id" : None
                        }
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        print("NEW CORPUS", corpus_id)

        ensure_dir(request.user)
        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()        # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the downloaded resource to the corpus
                corpus.add_resource( type = 3
                                   , path = filename
                                   )
                dwnldsOK += 1

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        try:
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = [query_string, query, N]
    return JsonHttpResponse(data)
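
The pagination loop in save() splits the N requested documents into ISTEX page requests of at most pagesize = 50, shrinking the last page. A standalone illustration of the same arithmetic, assuming Scraper.chunks() behaves like the usual grouper (an assumption; its body is not shown in this diff):

    # assumption: Scraper.chunks() groups a sequence into fixed-size slices like this
    def chunks(seq, size):
        for i in range(0, len(seq), size):
            yield seq[i:i + size]

    N, pagesize = 120, 50
    for k in chunks(range(N), pagesize):
        size = min(pagesize, N - k[0])
        print("from=%d&size=%d" % (k[0], size))
    # -> from=0&size=50  from=50&size=50  from=100&size=20
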
--- a/scrapers/pubmed.py
+++ b/scrapers/pubmed.py
+# ****************************
+# *****  Medline Scraper *****
+# ****************************

-from scrapers.MedlineFetcher import MedlineFetcher
+# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
+# between 9 pm and 5 am Eastern Time weekdays

 # from datetime import datetime
@@ -13,7 +17,7 @@ import threading
 from django.shortcuts import redirect
 from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

-from gargantext.constants import RESOURCETYPES
+from gargantext.constants import RESOURCETYPES, QUERY_SIZE_N_MAX

 from gargantext.models.nodes import Node
 from gargantext.util.db import session
 from gargantext.util.http import JsonHttpResponse
@@ -21,22 +25,11 @@ from gargantext.util.tools import ensure_dir
 from gargantext.util.scheduling import scheduled
 from gargantext.util.toolchain import parse_extract_indexhyperdata
+from scrapers.util import Scraper

-# to read the [scrapers] section of gargantext.ini
-#from configparser import ConfigParser

-# --------------------------------------------------------------------
-# importing constants from config file
-#CONF = ConfigParser()
-#with open(path.join(BASE_DIR, 'gargantext.ini')) as inifile:
-#    CONF.read_file(inifile)
-QUERY_SIZE_N_MAX = 1000 # int(CONF['scrappers']['QUERY_SIZE_N_MAX'])
-# QUERY_SIZE_N_DEFAULT = int(CONF['scrappers']['QUERY_SIZE_N_DEFAULT'])
-# --------------------------------------------------------------------

-def getGlobalStats( request ):
+def query( request ):
     """
     Pubmed year by year results
@@ -47,7 +40,7 @@ def getGlobalStats( request ):
     # 'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
     # ... ]
-    (reused as thequeries in doTheQuery)
+    (reused as thequeries in query_save)
     """
     print(request.method)
     alist = []
@@ -63,7 +56,7 @@ def getGlobalStats( request ):
     print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
     print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
-    instancia = MedlineFetcher()
+    instancia = Scraper()
     # serialFetcher (n_last_years, query, query_size)
     alist = instancia.serialFetcher( 5, query , N )
@@ -72,7 +65,7 @@ def getGlobalStats( request ):
     return JsonHttpResponse(data)

-def doTheQuery( request , project_id ) :
+def save( request , project_id ) :
     # implicit global session
     # do we have a valid project id?
     try:
@@ -103,7 +96,7 @@ def doTheQuery( request , project_id ) :
     # here we just realize queries already prepared by getGlobalStats
     # ===> no need to repeat N parameter like in testISTEX <===
-    instancia = MedlineFetcher()
+    instancia = Scraper()
     thequeries = json.loads(queries)
     # fyi the sum of our prepared yearly proportional quotas
@@ -138,7 +131,7 @@ def doTheQuery( request , project_id ) :
     ensure_dir(request.user)
-    tasks = MedlineFetcher()
+    tasks = Scraper()
     for i in range(8):
         t = threading.Thread(target=tasks.worker2) #thing to do
@@ -173,144 +166,3 @@ def doTheQuery( request , project_id ) :
     return JsonHttpResponse(data)

The remaining removals in this final hunk are the two ISTEX views that had been living in pubmed.py; they move to scrapers/istex.py (as query and save, shown above). The removed code:
def getGlobalStatsISTEXT(request ):
    """
    ISTEX simply the total of hits for a query
    (not reused in testISTEX)
    """
    print(request.method)
    alist = ["bar","foo"]

    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
        query_string = query.replace(" ","+")
        url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"

        tasks = MedlineFetcher()
        try:
            thedata_path = tasks.download( url )
            thedata = open(thedata_path, "rb")
            alist = thedata.read().decode('utf-8')
        except Exception as error:
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)


def testISTEX(request , project_id):
    print("testISTEX:")
    print(request.method)
    alist = ["bar","foo"]

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.typename == 'PROJECT')
    ).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    if request.method == "POST":
        query = "-"
        query_string = "-"
        N = 0

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ","+") # url encoded q

        if "N" in request.POST:
            N = int(request.POST["N"]) # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l ): ",msg)
                raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string , N))

        urlreqs = []
        pagesize = 50
        tasks = MedlineFetcher()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0]+pagesize)>N: pagesize = N-k[0]
            urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))

        # corpus node instanciation as a Django model
        corpus = Node(
            name = query,
            user_id = request.user.id,
            parent_id = project_id,
            typename = 'CORPUS',
            hyperdata = { "action" : "Scraping data"
                        , "language_id" : None
                        }
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        print("NEW CORPUS", corpus_id)

        ensure_dir(request.user)
        tasks = MedlineFetcher()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2) #thing to do
            t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put( url ) #put a task in th queue
        tasks.q.join() # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename!=False:
                # add the uploaded resource to the corpus
                corpus.add_resource( type = 3
                                   , path = filename
                                   )
                dwnldsOK+=1

        if dwnldsOK == 0 :
            return JsonHttpResponse(["fail"])

        ###########################
        try:
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = [query_string,query,N]
    return JsonHttpResponse(data)
--- a/scrapers/urls.py
+++ b/scrapers/urls.py
 from django.conf.urls import url

 import scrapers.pubmed as pubmed
-#import scrapers.istex as istex
+import scrapers.istex as istex
 #import scrapers.cern as cern
 #import scrapers.hal as hal
@@ -11,11 +11,21 @@ import scrapers.pubmed as pubmed
 # Available databases : Pubmed, IsTex, (next: CERN)

 # /!\ urls patterns here are *without* the trailing slash
-urlpatterns = [ url(r'^pubmed/query$' , pubmed.getGlobalStats )
-              , url(r'^pubmed/search/(\d+)' , pubmed.doTheQuery )
-              , url(r'^istex/query$' , pubmed.getGlobalStatsISTEXT )
-              , url(r'^istex/search/(\d+)' , pubmed.testISTEX )
-              #, url(r'^scraping$' , scraping.Target.as_view() )
+urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
+              , url(r'^pubmed/save/(\d+)' , pubmed.save )
+              , url(r'^istex/query$' , istex.query )
+              , url(r'^istex/save/(\d+)' , istex.save )
+              # TODO REST API for the scrapers
+              #, url(r'^rest$' , scraping.Target.as_view() )
               ,
 ]
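
Put together with the ^scrapers/ mount in gargantext/urls.py, the commit leaves the scraper endpoints as follows. A sketch for reference only; the host and the project id 42 are hypothetical:

    BASE = "http://127.0.0.1:8000/scrapers"
    routes = [
        BASE + "/pubmed/query",    # pubmed.query (was pubmed.getGlobalStats)
        BASE + "/pubmed/save/42",  # pubmed.save  (was pubmed/search/ -> doTheQuery)
        BASE + "/istex/query",     # istex.query  (was pubmed.getGlobalStatsISTEXT)
        BASE + "/istex/save/42",   # istex.save   (was istex/search/ -> testISTEX)
    ]
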
--- a/scrapers/MedlineFetcher.py
+++ b/scrapers/util.py
+#def count(keywords):
+#    return 42
+#
+#def query_save(keywords):
+#    return 'path/to/query.xml'
+#
+# ****************************
+# *****  Medline Fetcher *****
+# ****************************

+# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
+# between 9 pm and 5 am Eastern Time weekdays
 from gargantext.util.files import download

 import sys
-if sys.version_info >= (3, 0): from urllib.request import urlopen
-else: from urllib import urlopen
-import os
 import time
-# import libxml2
-from lxml import etree
-import datetime
-from django.core.files import File
-import codecs
 import threading
 from queue import Queue
-# import time

-class MedlineFetcher:
+from lxml import etree
+
+if sys.version_info >= (3, 0):
+    from urllib.request import urlopen
+else:
+    from urllib import urlopen
+
+class Scraper :

     def __init__(self):
         self.queue_size = 8
...
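
The views above drive Scraper through a queue/worker pattern: eight daemon threads run tasks.worker2, URLs are fed in with tasks.q.put(...), and tasks.q.join() blocks until every download has finished. The class body is collapsed in this diff; a minimal sketch of that pattern, under only the attributes visible here (q, firstResults, worker2, download), might look like:

    import threading
    from queue import Queue

    class ScraperSketch:
        # hedged sketch: the real Scraper body is collapsed above
        def __init__(self):
            self.queue_size   = 8
            self.q            = Queue()
            self.firstResults = []

        def download(self, url):
            # placeholder: the real method fetches url, returns a file path or False
            return False

        def worker2(self):
            # drain the queue forever; q.join() in the views unblocks once every
            # put() has been matched by a task_done()
            while True:
                url = self.q.get()
                self.firstResults.append(self.download(url))
                self.q.task_done()

    # usage as in the views:
    # s = ScraperSketch()
    # for i in range(8):
    #     threading.Thread(target=s.worker2, daemon=True).start()
    # for url in urlreqs: s.q.put(url)
    # s.q.join()
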
(JavaScript templates; the AJAX endpoints follow the search/ -> save/ route rename:)

@@ -260,7 +260,7 @@
         $.ajax({
             // contentType: "application/json",
-            url: window.location.origin+"/scrapers/pubmed/search/"+projectid,
+            url: window.location.origin+"/scrapers/pubmed/save/"+projectid,
             data: pubmedifiedQuery,
             type: 'POST',
             beforeSend: function(xhr) {
@@ -504,7 +504,7 @@
         $.ajax({
             // contentType: "application/json",
-            url: window.location.origin+"/scrapers/istex/search/"+projectid,
+            url: window.location.origin+"/scrapers/istex/save/"+projectid,
             data: postQuery,
             type: 'POST',
             beforeSend: function(xhr) {
...