[FEAT] Istex scraper ok, need parser now.

f2f0ce75 · delanoe · 9eead9fa · f2f0ce75 · f2f0ce75 · f2f0ce75
Commit f2f0ce75 authored Apr 07, 2016 by delanoe
8 changed files
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -160,10 +160,10 @@ RESOURCETYPES = [
        'parser': CSVParser,
        'default_language': 'en',
    },
-    # {   'name': 'ISTex',
-    #     # 'parser': ISTexParser,
-    #     'default_language': 'en',
-    # },
+    {   'name': 'ISTex',
+        'parser': ISTexParser,
+        'default_language': 'en',
+    },
 ]

 # linguistic extraction parameters ---------------------------------------------

--- a/gargantext/util/parsers/ISTex.py
+++ b/gargantext/util/parsers/ISTex.py
@@ -4,7 +4,7 @@ from datetime import datetime
 from io import BytesIO
 import json

-class ISTex(Parser):
+class ISTexParser(Parser):

    def parse(self, thefile):
        json_data=open(thefile,"r")
@@ -84,16 +84,16 @@ class ISTex(Parser):
                # ---------------------------------------------------
                if len(hyperdata["language_iso3"])>0 and hyperdata["language_iso3"][0] != "unknown" :
                    hyperdata["language_iso3"] = hyperdata["language_iso3"][0]
-                
+
                # default value = eng
                # possible even better: langid.classify(abstract)
                else:
                    # NB 97% des docs istex sont eng donc par défaut
                    # ----------------------------------------------
                    hyperdata["language_iso3"] = "eng"
-                    # (cf. api.istex.fr/document/?q=*&facet=language 
+                    # (cf. api.istex.fr/document/?q=*&facet=language
                    #  et  tests langid sur les language=["unknown"])
-                    
+

            if "publication_date" in hyperdata:
                RealDate = hyperdata["publication_date"]

--- a/gargantext/util/parsers/__init__.py
+++ b/gargantext/util/parsers/__init__.py
@@ -7,5 +7,5 @@ from .Pubmed import PubmedParser
 # # 2015-12-08: parser 2 en 1
 from .Europress import EuropressParser

-# from .ISTex import ISTexParser
+from .ISTex import ISTexParser
 from .CSV import CSVParser
--- a/scrapers/MedlineFetcher.py
+++ b/scrapers/MedlineFetcher.py
@@ -142,6 +142,13 @@ class MedlineFetcher:
            self.firstResults.append(result)
            self.q.task_done()

+
+    def chunks(self , l , n):
+        """
+        Lazily cut the sequence `l` into consecutive slices of `n` items.
+
+        The last slice is shorter when len(l) is not a multiple of n.
+        This is a generator: nothing below runs (the trace print
+        included) until the first slice is requested.
+        """
+        print("chunks:")
+        # range() is evaluated as soon as the generator expression is built,
+        # so an invalid step (n == 0) still fails right after the trace print
+        yield from (l[offset:offset + n] for offset in range(0, len(l), n))
+
+
    # GLOBALLIMIT:
    # I will retrieve this exact amount of publications.
    # The publications per year i'll retrieve per year will be :

--- a/scrapers/istex.py
+++ b/scrapers/istex.py
-def getGlobalStatsISTEXT(request ):
-    """
-    ISTEX simply the total of hits for a query
-
-    (not reused in testISTEX)
-    """
-    print(request.method)
-    alist = ["bar","foo"]
-
-    if request.method == "POST":
-        query = request.POST["query"]
-        N = int(request.POST["N"])
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
-        query_string = query.replace(" ","+")
-        url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"
-
-        tasks = MedlineFetcher()
-
-        try:
-            thedata_path = tasks.download( url )
-            thedata = open(thedata_path, "rb")
-            alist = thedata.read().decode('utf-8')
-        except Exception as error:
-            alist = [str(error)]
-    data = alist
-    return JsonHttpResponse(data)
-
-
-
-
-def testISTEX(request , project_id):
-    print("testISTEX:")
-    print(request.method)
-    alist = ["bar","foo"]
-    # implicit global session
-    # do we have a valid project id?
-    try:
-        project_id = int(project_id)
-    except ValueError:
-        raise Http404()
-
-    # do we have a valid project?
-    project = (session
-        .query(Node)
-        .filter(Node.id == project_id)
-        .filter(Node.typename == 'PROJECT')
-    ).first()
-
-    if project is None:
-        raise Http404()
-
-    # do we have a valid user?
-    user = request.user
-    if not user.is_authenticated():
-        return redirect('/auth/?next=%s' % request.path)
-    if project.user_id != user.id:
-        return HttpResponseForbidden()
-
-
-
-    if request.method == "POST":
-        query = "-"
-        query_string = "-"
-        N = 0
-
-        if "query" in request.POST:
-            query = request.POST["query"]
-            query_string = query.replace(" ","+")   # url encoded q
-
-        if "N" in request.POST:
-            N = int(request.POST["N"])     # query_size from views_opti
-            if N > QUERY_SIZE_N_MAX:
-                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
-                print("ERROR (scrap: istex d/l ): ",msg)
-                raise ValueError(msg)
-
-        print("Scrapping Istex: '%s' (%i)" % (query_string , N))
-
-        urlreqs = []
-        pagesize = 50
-        tasks = MedlineFetcher()
-        chunks = list(tasks.chunks(range(N), pagesize))
-        for k in chunks:
-            if (k[0]+pagesize)>N: pagesize = N-k[0]
-            urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
-
-
-        resourcetype = RESOURCETYPES["name"]["ISTex"]
-
-        # corpus node instanciation as a Django model
-        corpus = Node(
-            name = query,
-            user_id = request.user.id,
-            parent_id = project_id,
-            typename = 'CORPUS',
-            language_id = None,
-            hyperdata    = {'Processing' : "Parsing documents",}
-        )
-        session.add(corpus)
-        session.commit()
-        corpus_id = corpus.id
-
-        print("NEW CORPUS", corpus_id)
-        ensure_dir(request.user)
-        tasks = MedlineFetcher()
-
-        for i in range(8):
-            t = threading.Thread(target=tasks.worker2) #thing to do
-            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
-            t.start()
-        for url in urlreqs:
-            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-            tasks.q.put( [url , filename]) #put a task in th queue
-        tasks.q.join() # wait until everything is finished
-
-        dwnldsOK = 0
-        for filename in tasks.firstResults:
-            if filename!=False:
-                # add the uploaded resource to the corpus
-                corpus.add_resource(corpus,
-                    user_id = request.user.id,
-                    type_id = resourcetype.id,
-                    file = filename,
-                )
-                dwnldsOK+=1
-        if dwnldsOK == 0: return JsonHttpResponse(["fail"])
-        ###########################
-        ###########################
-        try:
-            scheduled(parse_extract_indexhyperdata(corpus_id,))
-        except Exception as error:
-            print('WORKFLOW ERROR')
-            print(error)
-        sleep(1)
-        return HttpResponseRedirect('/project/' + str(project_id))
-
-
-    data = [query_string,query,N]
-    return JsonHttpResponse(data)
-

--- a/scrapers/pubmed.py
+++ b/scrapers/pubmed.py
@@ -72,7 +72,6 @@ def getGlobalStats( request ):
    return JsonHttpResponse(data)


-
 def doTheQuery( request , project_id ) :
    # implicit global session
    # do we have a valid project id?
@@ -174,4 +173,144 @@ def doTheQuery( request , project_id ) :
    return JsonHttpResponse(data)


+def getGlobalStatsISTEXT(request ):
+    """
+    Forward a search query to the ISTEX API and hand back the raw answer.
+
+    On POST, reads "query" and "N" from the form data, downloads the ISTEX
+    response for that query through MedlineFetcher.download() and returns
+    the downloaded content (decoded as UTF-8) via JsonHttpResponse.
+    If downloading or reading fails, the response carries a one-element
+    list holding the error message instead.
+
+    On any other HTTP method the placeholder list ["bar","foo"] is returned.
+
+    (not reused in testISTEX)
+    """
+    print(request.method)
+    alist = ["bar","foo"]
+
+    if request.method == "POST":
+        query = request.POST["query"]
+        # N is only logged here: the URL built below does not use it
+        N = int(request.POST["N"])
+        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
+        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
+        query_string = query.replace(" ","+")
+        url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"
+
+        tasks = MedlineFetcher()
+
+        try:
+            thedata_path = tasks.download( url )
+            # the context manager closes the downloaded file as soon as it
+            # has been read, even if read()/decode() raises
+            with open(thedata_path, "rb") as thedata:
+                alist = thedata.read().decode('utf-8')
+        except Exception as error:
+            # best effort: report the failure to the client instead of a 500
+            alist = [str(error)]
+    return JsonHttpResponse(alist)
+
+
+def testISTEX(request , project_id):
+    """
+    Download ISTEX results for a query into a new corpus of a project.
+
+    Access checks, in this order:
+      - project_id not convertible to int, or no PROJECT node with that
+        id: raises Http404
+      - user not authenticated: redirect to '/auth/?next=<request.path>'
+      - project owned by another user: HttpResponseForbidden
+
+    Non-POST requests get JsonHttpResponse(["-", "-", 0]) (the default
+    query_string, query and N).
+
+    POST requests read the optional "query" and "N" form fields, then:
+      1. build one ISTEX URL per page of 50 results, up to N results
+      2. create and commit a CORPUS node under the project
+      3. download every URL with 8 daemon threads running
+         MedlineFetcher.worker2
+      4. attach each result that is not False to the corpus as a resource
+      5. return JsonHttpResponse(["fail"]) if nothing was attached,
+         otherwise start the parsing workflow and redirect to
+         '/projects/<project_id>'
+
+    Raises ValueError when N exceeds QUERY_SIZE_N_MAX (or is not an integer).
+    """
+    print("testISTEX:")
+    print(request.method)
+    # implicit global session
+    # do we have a valid project id?
+    try:
+        project_id = int(project_id)
+    except ValueError:
+        raise Http404()
+
+    # do we have a valid project?
+    project = (session
+        .query(Node)
+        .filter(Node.id == project_id)
+        .filter(Node.typename == 'PROJECT')
+    ).first()
+
+    if project is None:
+        raise Http404()
+
+    # do we have a valid user?
+    user = request.user
+    if not user.is_authenticated():
+        return redirect('/auth/?next=%s' % request.path)
+    if project.user_id != user.id:
+        return HttpResponseForbidden()
+
+    # Defaults are set before looking at the HTTP method, so that the
+    # non-POST answer below can always be built (these names used to be
+    # bound only inside the POST branch, which made that answer raise
+    # UnboundLocalError).
+    query = "-"
+    query_string = "-"
+    N = 0
+
+    if request.method != "POST":
+        return JsonHttpResponse([query_string,query,N])
+
+    if "query" in request.POST:
+        query = request.POST["query"]
+        query_string = query.replace(" ","+")   # url encoded q
+
+    if "N" in request.POST:
+        N = int(request.POST["N"])     # query_size from views_opti
+        if N > QUERY_SIZE_N_MAX:
+            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
+            print("ERROR (scrap: istex d/l ): ",msg)
+            raise ValueError(msg)
+
+    print("Scrapping Istex: '%s' (%i)" % (query_string , N))
+
+    # one URL per page of results: from=<offset>&size=<page length>
+    urlreqs = []
+    pagesize = 50
+    tasks = MedlineFetcher()
+    chunks = list(tasks.chunks(range(N), pagesize))
+    for k in chunks:
+        # the last page only asks for what is left to reach N
+        if (k[0]+pagesize)>N: pagesize = N-k[0]
+        urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
+
+    # create and persist the corpus node
+    corpus = Node(
+        name = query,
+        user_id = request.user.id,
+        parent_id = project_id,
+        typename = 'CORPUS',
+        hyperdata = { "action"      : "Scraping data"
+                    , "language_id" : None
+                    }
+    )
+
+    session.add(corpus)
+    session.commit()
+    corpus_id = corpus.id
+
+    print("NEW CORPUS", corpus_id)
+    ensure_dir(request.user)
+    # a second fetcher instance is used for the download phase
+    tasks = MedlineFetcher()
+
+    for i in range(8):
+        t = threading.Thread(target=tasks.worker2) #thing to do
+        t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
+        t.start()
+    for url in urlreqs:
+        tasks.q.put( url ) #put a task in the queue
+    tasks.q.join() # wait until everything is finished
+
+    dwnldsOK = 0
+    for filename in tasks.firstResults:
+        # presumably worker2 stores False for a failed download -- TODO confirm
+        if filename!=False:
+            # add the uploaded resource to the corpus
+            # NOTE(review): 3 is presumably the ISTex resource type -- confirm
+            corpus.add_resource( type = 3
+                               , path = filename
+                               )
+            dwnldsOK+=1
+
+    if dwnldsOK == 0 :
+        return JsonHttpResponse(["fail"])
+    ###########################
+    ###########################
+    try:
+        # NOTE(review): this calls parse_extract_indexhyperdata right here and
+        # passes its return value to scheduled(); confirm this is the
+        # intended call shape.
+        scheduled(parse_extract_indexhyperdata(corpus_id,))
+    except Exception as error:
+        # best effort: a workflow failure must not prevent the redirect
+        print('WORKFLOW ERROR')
+        print(error)
+    sleep(1)
+    return HttpResponseRedirect('/projects/' + str(project_id))
+
+
+

--- a/scrapers/urls.py
+++ b/scrapers/urls.py
 from django.conf.urls import url

 import scrapers.pubmed as pubmed
-#import scrapers.istex as istex
+#import scrapers.istex  as istex
+
 #import scrapers.cern  as cern
 #import scrapers.hal   as hal

@@ -13,8 +14,8 @@ import scrapers.pubmed as pubmed
 urlpatterns = [ url(r'^pubmed/query$'       , pubmed.getGlobalStats            )
              , url(r'^pubmed/search/(\d+)' , pubmed.doTheQuery                )

-#              , url(r'^istex/query$'        , pubmed.getGlobalStatsISTEXT      )
-#              , url(r'^istex/search/(\d+)'  , pubmed.testISTEX                 )
+              , url(r'^istex/query$'        , pubmed.getGlobalStatsISTEXT       )
+              , url(r'^istex/search/(\d+)'  , pubmed.testISTEX                  )
            #, url(r'^scraping$'              , scraping.Target.as_view()      )
              ,
              ]
--- a/templates/pages/projects/project.html
+++ b/templates/pages/projects/project.html
@@ -370,10 +370,10 @@
                                }

                                if(theType=="ISTex") {
-                                    console.log(window.location.origin+"tests/istextquery")
+                                    console.log(window.location.origin+"scrapers/istex/query")
                                    $.ajax({
                                        // contentType: "application/json",
-                                        url: window.location.origin+"/tests/istextquery",
+                                        url: window.location.origin+"/scrapers/istex/query",
                                        data: formData,
                                        type: 'POST',
                                        beforeSend: function(xhr) {
@@ -504,7 +504,7 @@

                                $.ajax({
                                    // contentType: "application/json",
-                                    url: window.location.origin+"/tests/project/"+projectid+"/ISTEXquery/go",
+                                    url: window.location.origin+"/scrapers/istex/search/"+projectid,
                                    data: postQuery,
                                    type: 'POST',
                                    beforeSend: function(xhr) {