Commit d0003ef9 authored by PkSM3

[FEAT] pubmed scraper

parent ed1311f3
......@@ -274,7 +274,6 @@ def do_tfidf(corpus, reset=True):
NodeNodeNgram.objects.filter(nodex=corpus).delete()
if isinstance(corpus, Node) and corpus.type.name == "Corpus":
print(Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")))
for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
for node_ngram in Node_Ngram.objects.filter(node=document):
try:
......
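The do_tfidf() hunk above walks every Document child of the corpus and its Node_Ngram rows to weight ngrams. A minimal, ORM-free sketch of the tf-idf weighting this implies, assuming the per-document ngram counts have already been collected into plain dicts (all names below are illustrative, not the project's API):

from math import log

def tfidf(corpora):
    """corpora: {doc_id: {ngram: count}} -> {doc_id: {ngram: tf-idf}} (illustrative sketch)."""
    n_docs = len(corpora)
    # document frequency: in how many documents each ngram appears
    df = {}
    for counts in corpora.values():
        for ngram in counts:
            df[ngram] = df.get(ngram, 0) + 1
    weights = {}
    for doc_id, counts in corpora.items():
        total = sum(counts.values()) or 1
        weights[doc_id] = {
            ngram: (count / total) * log(n_docs / df[ngram])
            for ngram, count in counts.items()
        }
    return weights

# example with two toy documents
print(tfidf({1: {"gene": 3, "cell": 1}, 2: {"cell": 2, "protein": 4}}))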
......@@ -69,7 +69,8 @@ urlpatterns = patterns('',
url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery)
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX)
)
......
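The two new routes expose the Pubmed and ISTEX scrapers per project, next to the existing /tests/pubmedquery stats endpoint. A hedged sketch of how a client could exercise them, assuming a local dev server, the third-party requests package, and that CSRF/auth are not enforced on these test routes; only the "query" and "string" POST fields are confirmed by this diff:

import requests

BASE = "http://localhost:8000"  # assumed dev server

# 1) ask for global result counts per year for a query
stats = requests.post(BASE + "/tests/pubmedquery", data={"query": "microbiota"})
print(stats.json())

# 2) launch the actual harvest for project 1 (Pubmed), or the ISTEX variant
requests.post(BASE + "/tests/project/1/pubmedquery/go", data={"query": "microbiota"})
requests.post(BASE + "/tests/project/1/ISTEXquery/go",
              data={"query": "microbiota", "string": "microbiota"})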
......@@ -223,8 +223,14 @@ def project(request, project_id):
corpus_view['count'] = corpus.children.count()
# just take the first element of the corpora and read its type.
corpus_type = Node_Resource.objects.filter(node=corpus)[0].resource.type
list_corpora[corpus_type].append(corpus_view)
resource_corpus = Node_Resource.objects.filter(node=corpus)
if len(resource_corpus)>0:
# print(Node_Resource.objects.filter(node=corpus).all())
corpus_type = Node_Resource.objects.filter(node=corpus)[0].resource.type
list_corpora[corpus_type].append(corpus_view)
donut_part[corpus_type] += docs_count
else: print(" Node_Resource = this.corpus(",corpus.pk,") ... nothing, why?")
## Avoid listing repeated elements, e.g. when the dynamic query is used (one entry per XML file)
# for node_resource in Node_Resource.objects.filter(node=corpus):
......@@ -237,6 +243,8 @@ def project(request, project_id):
if docs_total == 0 or docs_total is None:
docs_total = 1
# The donut will show: percentage of documents by resource type
donut = [ {'source': key,
'count': donut_part[key] ,
'part' : round(donut_part[key] * 100 / docs_total) } \
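The donut data aggregates docs_count per resource type and turns each bucket into a rounded percentage, with docs_total clamped to at least 1 so an empty project does not divide by zero. The same computation, isolated from the Django view, as a plain-Python sketch:

from collections import defaultdict

def build_donut(corpora):
    """corpora: list of (resource_type, docs_count) pairs -> donut parts in percent."""
    donut_part = defaultdict(int)
    for corpus_type, docs_count in corpora:
        donut_part[corpus_type] += docs_count
    docs_total = sum(donut_part.values()) or 1   # avoid division by zero
    return [{"source": key,
             "count": donut_part[key],
             "part": round(donut_part[key] * 100 / docs_total)}
            for key in donut_part]

print(build_donut([("pubmed", 120), ("istex", 60), ("pubmed", 20)]))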
......@@ -246,12 +254,15 @@ def project(request, project_id):
if request.method == 'POST':
form = CustomForm(request.POST, request.FILES)
if form.is_valid():
name = form.cleaned_data['name']
thefile = form.cleaned_data['file']
resource_type = ResourceType.objects.get(id=str( form.cleaned_data['type'] ))
print(request.POST['type'])
print(form.cleaned_data['type'])
resource_type = ResourceType.objects.get(name=str( form.cleaned_data['type'] ))
print("-------------")
print(name,"|",resource_type,"|",thefile)
......@@ -326,6 +337,7 @@ def project(request, project_id):
})
else:
form = CustomForm()
return render(request, 'project.html', {
'form' : form,
......@@ -748,9 +760,12 @@ def node_link(request, corpus_id):
'''
Create the HttpResponse object with the node_link dataset.
'''
import time
print("In node_link() START")
start = time.time()
data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
end = time.time()
print ("LOG::TIME: get_cooc() [s]",(end - start))
print("In node_link() END")
return JsonHttpResponse(data)
......
......@@ -98,13 +98,10 @@ from django import forms
from django.utils.translation import ugettext_lazy as _
class CustomForm(forms.Form):
name = forms.CharField( label='Name', max_length=199 , required=True)
parsing_options = ResourceType.objects.all().values_list('id', 'name')
type = forms.IntegerField( widget=forms.Select( choices= parsing_options) , required=True )
name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
type = ModelChoiceField( ResourceType.objects.all() , widget=forms.Select(attrs={'onchange':'CustomForSelect( $("option:selected", this).text() );'}) )
file = forms.FileField()
# Description: clean_file()
"""
* file_.content_type - Example: ['application/pdf', 'image/jpeg']
......
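CustomForm now resolves the resource type through a ModelChoiceField instead of a raw IntegerField over (id, name) choices, which is why the view switched its lookup from id= to name=. A minimal sketch of an equivalent form and how its cleaned data could be consumed; the import path and anything beyond what the diff shows are assumptions:

from django import forms
from node.models import ResourceType   # assumed import path

class CustomForm(forms.Form):
    name = forms.CharField(label='Name', max_length=199,
                           widget=forms.TextInput(attrs={'required': 'true'}))
    type = forms.ModelChoiceField(queryset=ResourceType.objects.all())
    file = forms.FileField()

def handle_upload(request):
    form = CustomForm(request.POST, request.FILES)
    if form.is_valid():
        # cleaned_data['type'] is already a ResourceType instance, so the
        # extra ResourceType.objects.get(name=...) round-trip in the view is
        # only needed if the instance has to be re-fetched by display name.
        return (form.cleaned_data['name'],
                form.cleaned_data['type'],
                form.cleaned_data['file'])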
......@@ -163,6 +163,7 @@ class Node(CTENode):
for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource
parser = defaultdict(lambda:FileParser.FileParser, {
'istext' : ISText,
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
......@@ -171,6 +172,7 @@ class Node(CTENode):
'europress_english' : EuropressFileParser,
})[resource.type.name]()
metadata_list += parser.parse(str(resource.file))
# print(parser.parse(str(resource.file)))
# retrieve info from the database
type_id = NodeType.objects.get(name='Document').id
langages_cache = LanguagesCache()
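parse_resources() picks the parser class from a defaultdict keyed by the resource type name (now including 'istext'), so unknown types silently fall back to the generic FileParser. A toy, framework-free sketch of the same dispatch pattern; the parser classes here are stand-ins:

from collections import defaultdict

class FileParser:
    def parse(self, path): return ["generic:" + path]

class PubmedFileParser(FileParser):
    def parse(self, path): return ["pubmed:" + path]

class ISTextParser(FileParser):
    def parse(self, path): return ["istext:" + path]

def parse_resource(type_name, path):
    parser_cls = defaultdict(lambda: FileParser, {
        'pubmed': PubmedFileParser,
        'istext': ISTextParser,
    })[type_name]
    return parser_cls().parse(path)

print(parse_resource('pubmed', 'corpus.xml'))   # -> ['pubmed:corpus.xml']
print(parse_resource('unknown', 'corpus.xml'))  # falls back to FileParser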
......@@ -183,6 +185,8 @@ class Node(CTENode):
language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
if isinstance(language, tuple):
language = language[0]
# print("metadata_values:")
# print("\t",metadata_values,"\n- - - - - - - - - - - - ")
Node(
user_id = user_id,
type_id = type_id,
......@@ -191,7 +195,6 @@ class Node(CTENode):
language_id = language.id if language else None,
metadata = metadata_values
).save()
# make metadata filterable
self.children.all().make_metadata_filterable()
......@@ -236,17 +239,32 @@ class Node(CTENode):
@current_app.task(filter=task_method)
def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
print("In workflow() parse_resources()")
import time
print("LOG::TIME: In workflow() parse_resources()")
start = time.time()
self.parse_resources()
print("In workflow() / parse_resources()")
print("In workflow() extract_ngrams()")
end = time.time()
print ("LOG::TIME: parse_resources() [s]",(end - start))
print("LOG::TIME: In workflow() / parse_resources()")
start = time.time()
print("LOG::TIME: In workflow() extract_ngrams()")
type_document = NodeType.objects.get(name='Document')
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
print("In workflow() / extract_ngrams()")
end = time.time()
print ("LOG::TIME: ",(end - start))
print ("LOG::TIME: extract_ngrams() [s]",(end - start))
print("LOG::TIME: In workflow() / extract_ngrams()")
start = time.time()
print("In workflow() do_tfidf()")
from analysis.functions import do_tfidf
do_tfidf(self)
print("In workflow() / do_tfidf()")
end = time.time()
print ("LOG::TIME: do_tfidf() [s]",(end - start))
print("LOG::TIME: In workflow() / do_tfidf()")
print("In workflow() END")
class Node_Metadata(models.Model):
......
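workflow() now brackets each stage (parse_resources, extract_ngrams, do_tfidf) with time.time() calls and LOG::TIME prints. The same instrumentation could be factored into a small context manager; a sketch only, not part of this commit:

import time
from contextlib import contextmanager

@contextmanager
def log_time(label):
    start = time.time()
    print("LOG::TIME: In", label)
    yield
    print("LOG::TIME:", label, "[s]", time.time() - start)

# usage, mirroring the workflow stages:
with log_time("parse_resources()"):
    time.sleep(0.1)          # stand-in for self.parse_resources()
with log_time("extract_ngrams()"):
    time.sleep(0.1)          # stand-in for the ngram extraction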
......@@ -25,6 +25,7 @@ class PubmedFileParser(FileParser):
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"title2" : 'MedlineCitation/Article/VernacularTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
"realdate_full_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
......@@ -51,6 +52,13 @@ class PubmedFileParser(FileParser):
except:
pass
#Title-Decision
Title=""
if not metadata["title"] or metadata["title"]=="":
if "title2" in metadata:
metadata["title"] = metadata["title2"]
else: metadata["title"] = ""
# Date-Decision
# forge.iscpif.fr/issues/1418
RealDate = ""
......@@ -68,19 +76,25 @@ class PubmedFileParser(FileParser):
if "publication_month" in metadata: PubmedDate+=" "+metadata["publication_month"]
if "publication_day" in metadata: PubmedDate+=" "+metadata["publication_day"]
Decision=""
if len(RealDate)>4:
if len(RealDate)>8: decision = datetime.strptime(RealDate, '%Y %b %d').date()
else: decision = datetime.strptime(RealDate, '%Y %b').date()
else: decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else:
try: Decision = datetime.strptime(RealDate, '%Y %b').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
if "publication_year" in metadata: metadata["publication_year"] = str(decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(decision.day)
if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
if "realdate_year_" in metadata: metadata.pop("realdate_year_")
if "realdate_month_" in metadata: metadata.pop("realdate_month_")
if "realdate_day_" in metadata: metadata.pop("realdate_day_")
if "title2" in metadata: metadata.pop("title2")
# print(metadata)
metadata_list.append(metadata)
# return the list of metadata
return metadata_list
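The date decision now prefers the Medline RealDate string, choosing '%Y %b %d' or '%Y %b' by its length, and falls back to the assembled PubmedDate ('%Y %m %d') whenever that parse fails. The same fallback, isolated from the parser class as a small sketch:

from datetime import datetime

def decide_date(real_date, pubmed_date):
    """real_date: e.g. '2014 Sep 12' or '2014 Sep'; pubmed_date: e.g. '2014 9 12'."""
    if len(real_date) > 4:
        fmt = '%Y %b %d' if len(real_date) > 8 else '%Y %b'
        try:
            return datetime.strptime(real_date, fmt).date()
        except ValueError:
            pass
    return datetime.strptime(pubmed_date, '%Y %m %d').date()

print(decide_date("2014 Sep 12", "2014 9 12"))  # -> 2014-09-12
print(decide_date("2014-2015", "2014 1 1"))     # unparsable RealDate -> 2014-01-01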
......@@ -2,3 +2,4 @@ from parsing.FileParsers.RisFileParser import RisFileParser
from parsing.FileParsers.IsiFileParser import IsiFileParser
from parsing.FileParsers.PubmedFileParser import PubmedFileParser
from parsing.FileParsers.EuropressFileParser import EuropressFileParser
from parsing.FileParsers.ISText import ISText
......@@ -10,29 +10,36 @@ import os
import time
# import libxml2
from lxml import etree
from datetime import datetime
from django.core.files import File
import threading
from queue import Queue
import time
class MedlineFetcher:
def __init__(self):
self.queue_size = 8
self.q = Queue()
self.firstResults = []
self.lock = threading.Lock() # lock to serialize console output
self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
self.pubMedDB = 'Pubmed'
self.reportType = 'medline'
self.personalpath_mainPath = 'MedLine/'
if not os.path.isdir(self.personalpath_mainPath):
os.makedirs(self.personalpath_mainPath)
print ('Created directory ' + self.personalpath_mainPath)
# Return the:
# Return the globalResults!:
# - count    = number of results for the query
# - queryKey = NCBI history key, reused by medlineEfetch
# - webEnv   = NCBI history environment, reused by medlineEfetch
def medlineEsearch(self , query):
print ("MedlineFetcher::medlineEsearch :")
# print ("MedlineFetcher::medlineEsearch :")
"Get number of results for query 'query' in variable 'count'"
"Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
origQuery = query
query = query.replace(' ', '%20')
eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' %(self.pubMedEutilsURL, self.pubMedDB, query)
......@@ -50,13 +57,7 @@ class MedlineFetcher:
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
# doc = libxml2.parseDoc(data)
# count = doc.xpathEval('eSearchResult/Count/text()')[0]
# queryKey = doc.xpathEval('eSearchResult/QueryKey/text()')[0]
# webEnv = doc.xpathEval('eSearchResult/WebEnv/text()')[0]
# print count, queryKey, webEnv
values = { "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
print(values)
values = { "query":origQuery , "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
return values
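medlineEsearch() builds an esearch.fcgi URL with usehistory=y and extracts count, queryKey and webEnv from the XML, now also carrying the original query back in the result dict. A compact standalone sketch of that call (network access to NCBI assumed; error handling omitted):

from urllib.request import urlopen
from urllib.parse import quote
from lxml import etree

def esearch(query, db='Pubmed'):
    base = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
    url = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' % (base, db, quote(query))
    root = etree.parse(urlopen(url)).getroot()
    return {
        "query":    query,
        "count":    int(root.findtext("Count")),
        "queryKey": root.findtext("QueryKey"),
        "webEnv":   root.findtext("WebEnv"),
    }

# print(esearch('2014[dp] microbiota'))   # needs network access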
......@@ -72,40 +73,58 @@ class MedlineFetcher:
queryKey = fullquery["queryKey"]
webEnv = fullquery["webEnv"]
print ("MedlineFetcher::medlineEfetchRAW :")
"Fetch medline result for query 'query', saving results to file every 'retmax' articles"
queryNoSpace = query.replace(' ', '') # No space in directory and file names, avoids stupid errors
# pubmedqueryfolder = personalpath.pubMedAbstractsPath + 'Pubmed_' + queryNoSpace
# if not os.path.isdir(pubmedqueryfolder):
# os.makedirs(pubmedqueryfolder)
pubMedResultFileName = self.personalpath_mainPath + 'Pubmed_' + queryNoSpace + '.xml'
pubMedResultFile = open(pubMedResultFileName, 'w')
print ('Query "' , query , '"\t:\t' , count , ' results')
print ('Starting fetching at ' , time.asctime(time.localtime()) )
print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
retstart = 0
# while(retstart < count):
eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
return eFetch
# if sys.version_info >= (3, 0): pubMedResultFile.write(eFetchResult.read().decode('utf-8'))
# else: pubMedResultFile.write(eFetchResult.read())
# retstart += retmax
# break # you shall not pass !!
# pubMedResultFile.close()
# print ('Fetching for query ' , query , ' finished at ' , time.asctime(time.localtime()) )
# print (retmax , ' results written to file ' , pubMedResultFileName , '\n' )
# print("------------------------------------------")
# return ["everything","ok"]
# generic!
def downloadFile(self, item):
url = item[0]
filename = item[1]
print("\tin downloadFile:")
print(url,filename)
data = urlopen(url)
f = open(filename, 'w')
myfile = File(f)
myfile.write( data.read().decode('utf-8') )
myfile.close()
f.close()
with self.lock:
print(threading.current_thread().name, filename+" OK")
return filename
# generic!
def do_work(self,item):
# time.sleep(1) # pretend to do some lengthy work.
returnvalue = self.medlineEsearch(item)
with self.lock:
print(threading.current_thread().name, item)
return returnvalue
# The worker thread pulls an item from the queue and processes it
def worker(self):
while True:
item = self.q.get()
self.firstResults.append(self.do_work(item))
self.q.task_done()
def worker2(self):
while True:
item = self.q.get()
self.firstResults.append(self.downloadFile(item))
self.q.task_done()
def chunks(self , l , n):
print("chunks:")
for i in range(0, len(l), n):
yield l[i:i+n]
# GLOBALLIMIT:
# I will retrieve this exact amount of publications.
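worker() and worker2() implement the usual thread-pool-over-Queue pattern: daemon threads pull items, run do_work() or downloadFile(), append the result to firstResults and mark the task done, while q.join() blocks until the queue drains. A self-contained sketch of that pattern with a dummy work function:

import threading
from queue import Queue

q = Queue()
results = []
lock = threading.Lock()

def do_work(item):
    return item * item          # stand-in for medlineEsearch / downloadFile

def worker():
    while True:
        item = q.get()
        out = do_work(item)
        with lock:               # serialize access to the shared result list
            results.append(out)
        q.task_done()

for _ in range(8):               # same pool size as MedlineFetcher.queue_size
    t = threading.Thread(target=worker)
    t.daemon = True              # dies when the main thread exits
    t.start()

for item in range(10):
    q.put(item)
q.join()                         # wait until every task is processed
print(sorted(results))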
......@@ -115,22 +134,34 @@ class MedlineFetcher:
# - GlobalLimit : number of publications I want.
def serialFetcher(self , yearsNumber , query, globalLimit):
# Create the queue and thread pool.
for i in range(self.queue_size):
t = threading.Thread(target=self.worker)
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
start = time.perf_counter()
N = 0
print ("MedlineFetcher::serialFetcher :")
thequeries = []
globalresults = []
for i in range(yearsNumber):
year = str(2015 - i)
print ('YEAR ' + year)
print ('---------\n')
# medlineEfetch(str(year) + '[dp] '+query , 20000)
# medlineEfetchRAW(str(year) + '[dp] '+query , retmax=300)
pubmedquery = str(year) + '[dp] '+query
globalresults = self.medlineEsearch(pubmedquery)
self.q.put( pubmedquery ) #put task in the queue
self.q.join()
print('time:',time.perf_counter() - start)
for globalresults in self.firstResults:
# globalresults = self.medlineEsearch(pubmedquery)
if globalresults["count"]>0:
N+=globalresults["count"]
querymetadata = {
"string": pubmedquery ,
"string": globalresults["query"] ,
"count": globalresults["count"] ,
"queryKey":globalresults["queryKey"] ,
"webEnv":globalresults["webEnv"] ,
......@@ -149,11 +180,3 @@ class MedlineFetcher:
query["retmax"] = retmax_forthisyear
return thequeries
# serialFetcher(yearsNumber=3, 'microbiota' , globalLimit=100 )
# query = str(2015)+ '[dp] '+'microbiota'
# medlineEsearch( query )
#
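The tail of serialFetcher (collapsed above) gives each yearly query a retmax budget. Assuming the split is proportional to each year's result count out of the grand total N, it could look like the sketch below; the exact formula is not visible in this diff:

def allocate_retmax(thequeries, total_count, global_limit):
    """Split global_limit across yearly queries, proportional to each year's count.
    Illustrative only: the real formula lives in the collapsed part of serialFetcher."""
    for query in thequeries:
        proportion = query["count"] / total_count if total_count else 0
        query["retmax"] = int(round(global_limit * proportion))
    return thequeries

queries = [{"count": 700}, {"count": 300}]
print(allocate_retmax(queries, 1000, 300))   # -> retmax 210 and 90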
......@@ -14,6 +14,8 @@ import json
from gargantext_web.settings import MEDIA_ROOT
from datetime import datetime
import time
import threading
from django.core.files import File
from gargantext_web.settings import DEBUG
......@@ -28,14 +30,16 @@ def getGlobalStats(request ):
if request.method == "POST":
query = request.POST["query"]
print ("LOG::TIME: query =", query )
print ("LOG::TIME: N =", 300 )
instancia = MedlineFetcher()
alist = instancia.serialFetcher( 5, query , 100 )
# alist = instancia.serialFetcher( 5, query , int(request.POST["N"]) )
alist = instancia.serialFetcher( 5, query , 300 )
data = alist
return JsonHttpResponse(data)
from parsing.FileParsers import PubmedFileParser
def doTheQuery(request , project_id):
alist = ["hola","mundo"]
......@@ -78,17 +82,20 @@ def doTheQuery(request , project_id):
corpus.save()
try:
tasks = MedlineFetcher()
# configure the queue: start the worker threads
for i in range(8):
t = threading.Thread(target=tasks.worker2) # each worker downloads one queued file
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
print(url)
data = urlopen(url)
xmlname = MEDIA_ROOT + '/corpora/%s/%s.xml' % (request.user, str(datetime.now().microsecond))
f = open(xmlname, 'w')
myfile = File(f)
myfile.write( data.read().decode('utf-8') )
myfile.close()
f.close()
corpus.add_resource( user=request.user, type=resource_type, file=xmlname )
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
tasks.q.put( [url , filename]) # put a task in the queue
tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
# do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
......@@ -96,7 +103,6 @@ def doTheQuery(request , project_id):
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"])
except Exception as error:
print(error)
......@@ -106,4 +112,79 @@ def doTheQuery(request , project_id):
print("lele",error)
data = alist
return JsonHttpResponse(data)
\ No newline at end of file
return JsonHttpResponse(data)
def testISTEX(request , project_id):
print(request.method)
alist = ["bar","foo"]
if request.method == "POST":
# print(alist)
query = "-"
query_string = "-"
N = 60
if "query" in request.POST: query = request.POST["query"]
if "string" in request.POST: query_string = request.POST["string"].replace(" ","+")
# if "N" in request.POST: N = request.POST["N"]
print(query_string , query , N)
urlreqs = []
pagesize = 50
tasks = MedlineFetcher()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
print(urlreqs)
# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
resource_type = ResourceType.objects.get(name="istext" )
parent = Node.objects.get(id=project_id)
node_type = NodeType.objects.get(name='Corpus')
type_id = NodeType.objects.get(name='Document').id
user_id = User.objects.get( username=request.user ).id
corpus = Node(
user=request.user,
parent=parent,
type=node_type,
name=query,
)
corpus.save()
# configure the queue: start the worker threads
for i in range(8):
t = threading.Thread(target=tasks.worker2) # each worker downloads one queued file
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
tasks.q.put( [url , filename]) # put a task in the queue
tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
corpus.save()
# do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
else:
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"])
except Exception as error:
print(error)
data = [query_string,query,N]
return JsonHttpResponse(data)
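testISTEX reuses MedlineFetcher.chunks() to paginate the ISTEX API with from/size parameters, shrinking the last page so the total never exceeds N, then feeds the URLs to the same worker2 download pool as doTheQuery. The URL building alone, as a standalone sketch (the query string here is a placeholder):

def istex_urls(query_string, n, pagesize=50):
    urls = []
    for start in range(0, n, pagesize):
        size = min(pagesize, n - start)      # shrink the last page
        urls.append("http://api.istex.fr/document/?q=" + query_string
                    + "&output=*&from=" + str(start) + "&size=" + str(size))
    return urls

for url in istex_urls("microbiota", 60):
    print(url)   # two pages: from=0&size=50, then from=50&size=10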