NAMING convention for CRAWLERS

87d24fb0 · c24b · b5004e99 · 87d24fb0 · 87d24fb0 · 87d24fb0
Commit 87d24fb0 authored Jul 27, 2016 by c24b
Showing with 0 additions and 210 deletions

CERN.py gargantext/util/crawlers/CERN.py +0 -0

ISTEX.py gargantext/util/crawlers/ISTEX.py +0 -0

PUBMED.py gargantext/util/crawlers/PUBMED.py +0 -0

util.py.old gargantext/util/crawlers/util.py.old +0 -210

No files found.
--- a/gargantext/util/crawlers/cern.py
+++ b/gargantext/util/crawlers/cern.py
--- a/gargantext/util/crawlers/istex.py
+++ b/gargantext/util/crawlers/istex.py
--- a/gargantext/util/crawlers/pubmed.py
+++ b/gargantext/util/crawlers/pubmed.py
--- a/gargantext/util/crawlers/util.py.old
+++ b/gargantext/util/crawlers/util.py.old
-from gargantext.util.files import download
-import sys
-import time
-import threading
-from queue import Queue
-from lxml import etree
-if sys.version_info >= (3, 0):
-    from urllib.request import urlopen
-else:
-    from urllib import urlopen
-class Scraper :
-    def __init__(self):
-        self.queue_size      = 8
-        self.q               = Queue()
-        self.firstResults    = []
-        self.lock            = threading.Lock() # lock to serialize console output
-        self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
-        self.pubMedDB        = 'Pubmed'
-        self.reportType      = 'medline'
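-    # Return the global results of the search as a dict:
-    # - query    = the original query string (False if the request failed)
-    # - count    = number of results reported for the query (0 if the request failed)
-    # - queryKey = QueryKey taken from the eSearch response (False if the request failed)
-    # - webEnv   = WebEnv taken from the eSearch response (False if the request failed)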
-    def medlineEsearch(self , query):
-        # print ("MedlineFetcher::medlineEsearch :")
-        "Get number of results for query 'query' in variable 'count'"
-        "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
-        # print(query)
-        origQuery = query
-        query     = query.replace(' ', '%20')
-        eSearch   = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
-                     % ( self.pubMedEutilsURL, self.pubMedDB, query )
-        try:
-            eSearchResult = urlopen(eSearch)
-            data          = eSearchResult.read()
-            root          = etree.XML(data)
-            findcount     = etree.XPath("/eSearchResult/Count/text()")
-            count         = findcount(root)[0]
-            findquerykey  = etree.XPath("/eSearchResult/QueryKey/text()")
-            queryKey      = findquerykey(root)[0]
-            findwebenv    = etree.XPath("/eSearchResult/WebEnv/text()")
-            webEnv        = findwebenv(root)[0]
-        except Exception as Error:
-            print(Error)
-            count         = 0
-            queryKey      = False
-            webEnv        = False
-            origQuery     = False
-        values = { "query"    : origQuery
-                 , "count"    : int(count)
-                 , "queryKey" : queryKey
-                 , "webEnv"   : webEnv
-                 }
-        return values
-    # RETMAX:
-    # Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
-    # maximum of 100,000 records
-    def medlineEfetchRAW( self , fullquery):
-        query    = fullquery [ "string"  ]
-        retmax   = fullquery [ "retmax"  ]
-        count    = fullquery [ "count"   ]
-        queryKey = fullquery [ "queryKey"]
-        webEnv   = fullquery [ "webEnv"  ]
-        "Fetch medline result for query 'query', saving results to file every 'retmax' articles"
-        queryNoSpace = query.replace(' ', '') # No space in directory and file names, avoids stupid errors
-        # print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
-        retstart = 0
-        eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
-        return eFetch
-    # generic!
-    def download(self, url):
-        print(url)
-        filename = download(url)
-        with self.lock:
-            print(threading.current_thread().name, filename+" OK")
-            return filename
-    # generic!
-    def do_work(self,item):
-        # time.sleep(1) # pretend to do some lengthy work.
-        returnvalue = self.medlineEsearch(item)
-        with self.lock:
-            # print(threading.current_thread().name, item)
-            return returnvalue
-    # The worker thread pulls an item from the queue and processes it
-    def worker(self):
-        while True:
-            item = self.q.get()
-            self.firstResults.append(self.do_work(item))
-            self.q.task_done()
-    def worker2(self):
-        while True:
-            item = self.q.get()
-            results = []
-            try:
-                result = self.download(item)
-            except Exception as error :
-                print(error)
-                result = False
-            self.firstResults.append(result)
-            self.q.task_done()
-    def chunks(self , l , n):
-        print("chunks:")
-        for i in range(0, len(l), n):
-            yield l[i:i+n]
-    # GLOBALLIMIT:
-    # The total number of publications to retrieve.
-    # The number of publications retrieved for each year will be:
-    #        (k/N)*GlobalLimit
-    #                  \_ this is used as RETMAX
-    # - k : Number of publications for year x (according to PubMed)
-    # - N : Sum of every k over the set of years {X} (total number of publications according to PubMed)
-    # - GlobalLimit : Number of publications wanted.
-    def serialFetcher(self , yearsNumber , query, globalLimit):
-        # Create the queue and thread pool.
-        for i in range(self.queue_size):
-             t = threading.Thread(target=self.worker)
-             t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
-             t.start()
-        start = time.perf_counter()
-        N = 0
-        # print ("MedlineFetcher::serialFetcher :")
-        thequeries = []
-        globalresults = []
-        for i in range(yearsNumber):
-            year = str(2015 - i)
-            # print ('YEAR ' + year)
-            # print ('---------\n')
-            pubmedquery = str(year) + '[dp] '+query
-            self.q.put( pubmedquery ) #put task in the queue
-        self.q.join()
-        print('time:',time.perf_counter() - start)
-        Total = 0
-        Fails = 0
-        for globalresults in self.firstResults:
-            # globalresults = self.medlineEsearch(pubmedquery)
-            Total += 1
-            if globalresults["queryKey"]==False:
-                Fails += 1
-            if globalresults["count"] > 0 :
-                N+=globalresults["count"]
-                queryhyperdata = { "string"   : globalresults["query"]
-                                 , "count"    : globalresults["count"]
-                                 , "queryKey" : globalresults["queryKey"]
-                                 , "webEnv"   : globalresults["webEnv"]
-                                 , "retmax"   : 0
-                                 }
-                thequeries.append ( queryhyperdata )
-        print("Total Number:", N,"publications")
-        print("And i want just:",globalLimit,"publications")
-        print("---------------------------------------\n")
-        for i,query in enumerate(thequeries):
-            k                  = query["count"]
-            proportion         = k/float(N)
-            retmax_forthisyear = int(round(globalLimit*proportion))
-            query["retmax"]    = retmax_forthisyear
-            if query["retmax"] == 0 : query["retmax"]+=1
-            print(query["string"],"\t[",k,">",query["retmax"],"]")
-        if ((Fails+1)/(Total+1)) == 1 : # for identifying the epic fail or connection error
-            thequeries = [False]
-        return thequeries