Commit 82341b15 authored by c24b

PARSER and CRAWLER changed with NEW NAMING CONVENTION

parent 6c5d252b
# Scrapers config
QUERY_SIZE_N_MAX = 1000
from gargantext.constants import get_resource
from gargantext.util.scheduling import scheduled
from gargantext.util.db import session
from requests_futures.sessions import FuturesSession
import requests
from gargantext.models.nodes import Node
#needed by Crawler.create_corpus below
from gargantext.util.toolchain import parse_extract_indexhyperdata
from datetime import date
class Crawler:
"""Base class for performing search and add corpus file depending on the type
"""
def __init__(self, record):
#the name of the corpus that will be built in case of internal file parsing
self.record = record
self.name = record["corpus_name"]
self.project_id = record["project_id"]
self.user_id = record["user_id"]
self.resource = record["source"]
self.type = get_resource(self.resource)
self.query = record["query"]
#format the sampling
self.n_last_years = 5
self.YEAR = date.today().year
#not pretty, but the easy version: zero-pad the month to two digits
self.MONTH = str(date.today().month).zfill(2)
self.MAX_RESULTS = 1000
try:
self.results_nb = int(record["count"])
except KeyError:
#not set yet
self.results_nb = 0
try:
self.webEnv = record["webEnv"]
self.queryKey = record["queryKey"]
self.retMax = record["retMax"]
except KeyError:
#not set yet
self.queryKey = None
self.webEnv = None
self.retMax = 1
self.status = [None]
self.path = "/tmp/results.txt"
def tmp_file(self):
'''store the results in a temporary file, depending on the format type'''
raise NotImplementedError
def parse_query(self):
'''parse the query parameters, depending on the resource type,
and retrieve the set of activated search options
'''
raise NotImplementedError
def fetch(self):
if self.download():
self.create_corpus()
return self.corpus_id
def get_sampling_dates(self):
'''Create a list of (min, max) date pairs based on YEAR and MONTH
for the N last years.'''
dates = []
for i in range(self.n_last_years):
maxyear = self.YEAR - i
mindate = str(maxyear-1)+"/"+str(self.MONTH)
maxdate = str(maxyear)+"/"+str(self.MONTH)
print(mindate,"-",maxdate)
dates.append((mindate, maxdate))
return dates
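# Hedged sketch of the expected output, assuming today is May 2016
# (YEAR=2016, MONTH="05") and n_last_years=5:
# >>> self.get_sampling_dates()
# [('2015/05', '2016/05'), ('2014/05', '2015/05'), ('2013/05', '2014/05'),
#  ('2012/05', '2013/05'), ('2011/05', '2012/05')]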
def create_corpus(self):
#create a corpus node
corpus = Node(
name = self.query,
user_id = self.user_id,
parent_id = self.project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scraping data",
"language_id" : self.type["default_language"],
}
)
#attach the downloaded file(s) as resources
if len(self.paths) > 0:
for path in self.paths:
corpus.add_resource(
type = self.type["type"],
name = self.type["name"],
path = path
)
else:
corpus.add_resource(
type = self.type["type"],
name = self.type["name"],
path = self.path
)
session.add(corpus)
session.commit()
#the corpus id is only available once the node has been committed
self.corpus_id = corpus.id
scheduled(parse_extract_indexhyperdata(corpus.id))
return corpus
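# Illustrative (hypothetical) usage of a Crawler subclass, based on the
# record keys read in __init__ above; the concrete values and the use of
# ISTexCrawler here are assumptions, not part of this module:
#
# record = {"corpus_name": "my corpus", "project_id": 1, "user_id": 1,
#           "source": 8, "query": "graphene", "count": 100}
# crawler = ISTexCrawler(record)
# if crawler.scan_results() > 0:
#     corpus_id = crawler.fetch()   # download() then create_corpus()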
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** CERN Scraper *****
# ****************************
# Author:c24b
# Date: 27/05/2015
from ._Crawler import Crawler
import hmac, hashlib
import requests
import os
import random
import urllib.parse as uparse
from lxml import etree
from gargantext.settings import API_TOKENS
#from gargantext.util.files import build_corpus_path
from gargantext.util.db import session
from gargantext.models import Node
#needed by CernCrawler.create_corpus below
from gargantext.util.toolchain import parse_extract_indexhyperdata
class CernCrawler(Crawler):
'''CERN SCOAP3 API Interaction'''
def __generate_signature__(self, url):
'''create the request signature'''
#hmac-sha1 salted with the secret key
return hmac.new(self.secret, url, hashlib.sha1).hexdigest()
def __format_query__(self, query, of="xm", fields=None):
'''build the query parameters;
for the filter params see https://scoap3.org/scoap3-repository/xml-api/
'''
#dict_q = uparse.parse_qs(query)
dict_q = {}
#by default: search by pattern
dict_q["p"] = query
if fields is not None and isinstance(fields, list):
fields = ",".join(fields)
dict_q["f"] = fields
#outputformat: "xm", "xmt", "h", "html"
dict_q["of"]= of
return dict_q
def __format_url__(self, dict_q):
'''format the url with the encoded query'''
#add the apikey
dict_q["apikey"] = self.apikey
#quote each whole value (not just its first character)
params = "&".join([str(k)+"="+uparse.quote(str(v)) for k, v in sorted(dict_q.items())])
return self.BASE_URL+params
def sign_url(self, dict_q):
'''add signature'''
API = API_TOKENS["CERN"]
self.apikey = API["APIKEY"]
self.secret = API["APISECRET"].encode("utf-8")
self.BASE_URL = u"http://api.scoap3.org/search?"
url = self.__format_url__(dict_q)
return url+"&signature="+self.__generate_signature__(url.encode("utf-8"))
def create_corpus(self):
#create a corpus
corpus = Node(
name = self.query,
#user_id = self.user_id,
parent_id = self.project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
, "language_id" : self.type["default_language"]
}
)
#add the resource
corpus.add_resource(
type = self.type["type"],
name = self.type["name"],
path = self.path)
try:
print("PARSING")
# p = eval(self.type["parser"])()
session.add(corpus)
session.commit()
self.corpus_id = corpus.id
parse_extract_indexhyperdata(corpus.id)
return self
except Exception as error:
print('WORKFLOW ERROR')
print(error)
session.rollback()
return self
def download(self):
import time
self.path = "/tmp/results.xml"
query = self.__format_query__(self.query)
url = self.sign_url(query)
start = time.time()
r = requests.get(url, stream=True)
downloaded = False
#the long part
with open(self.path, 'wb') as f:
print("Downloading file")
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
#print("===")
f.write(chunk)
downloaded = True
end = time.time()
#print (">>>>>>>>>>LOAD results", end-start)
return downloaded
def scan_results(self):
'''scan the number of results by fetching a single record
(only the author, page 1); the total is read from the comment
at the top of the returned page
'''
import time
self.results_nb = 0
query = self.__format_query__(self.query, of="hb")
query["ot"] = "100"
query["jrec"]='1'
query["rg"]='1'
url = self.sign_url(query)
print(url)
#start = time.time()
r = requests.get(url)
#end = time.time()
#print (">>>>>>>>>>LOAD results_nb", end-start)
if r.status_code == 200:
self.results_nb = int(r.text.split("-->")[0].split(': ')[-1][:-1])
return self.results_nb
else:
raise ValueError(r.status_code)
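# Sketch of the signed URL produced by sign_url above (values are placeholders):
# query "graphene" with of="xm" gives the sorted params
#   apikey=<APIKEY>&of=xm&p=graphene
# appended to http://api.scoap3.org/search?, and the final URL is
#   <url>&signature=<hex HMAC-SHA1 of <url>, keyed with APISECRET>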
from ._Crawler import *
import json
class ISTexCrawler(Crawler):
"""
ISTEX Crawler
"""
def __format_query__(self, query=None):
'''format the query (simple '+' encoding; urllib quoting could be used instead)'''
if query is not None:
query = query.replace(" ", "+")
return query
else:
self.query = self.query.replace(" ", "+")
return self.query
def scan_results(self):
#get the number of results
self.results_nb = 0
self.query = self.__format_query__()
_url = "http://api.istex.fr/document/?q="+self.query+"&size=0"
#"&output=id,title,abstract,pubdate,corpusName,authors,language"
r = requests.get(_url)
print(r)
if r.status_code == 200:
self.results_nb = int(r.json()["total"])
self.status.append("fetching results")
return self.results_nb
else:
self.status.append("error")
raise ValueError(r.status_code)
def download(self):
'''fetching items'''
downloaded = False
def get_hits(future):
'''here we directly get the result hits'''
response = future.result()
if response.status_code == 200:
return response.json()["hits"]
else:
return None
#session = FuturesSession()
#self.path = "/tmp/results.json"
self.status.append("fetching results")
paging = 100
self.query_max = self.results_nb
if self.query_max > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (self.query_max, QUERY_SIZE_N_MAX)
print("ERROR (scrap: istex d/l ): ",msg)
self.query_max = QUERY_SIZE_N_MAX
#urlreqs = []
with open(self.path, 'wb') as f:
for i in range(0, self.query_max, paging):
url_base = "http://api.istex.fr/document/?q="+self.query+"&output=*&from=%i&size=%i" %(i, paging)
r = requests.get(url_base)
if r.status_code == 200:
downloaded = True
f.write(r.text.encode("utf-8"))
else:
downloaded = False
self.status.insert(0, "error fetching ISTEX " + str(r.status_code))
break
return downloaded
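# Sketch of the paged requests issued above, e.g. for results_nb = 250
# (capped at QUERY_SIZE_N_MAX) and paging = 100:
#   http://api.istex.fr/document/?q=<query>&output=*&from=0&size=100
#   http://api.istex.fr/document/?q=<query>&output=*&from=100&size=100
#   http://api.istex.fr/document/?q=<query>&output=*&from=200&size=100
# each response body being appended to self.path as UTF-8 JSON.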
from gargantext.util.files import download
import sys
import time
import threading
from queue import Queue
from lxml import etree
if sys.version_info >= (3, 0):
from urllib.request import urlopen
else:
from urllib import urlopen
class Scraper :
def __init__(self):
self.queue_size = 8
self.q = Queue()
self.firstResults = []
self.lock = threading.Lock() # lock to serialize console output
self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
self.pubMedDB = 'Pubmed'
self.reportType = 'medline'
# medlineEsearch returns the global results:
# - count : number of results for the query
# - queryKey : key for reusing the result set on the history server
# - webEnv : web environment token used by subsequent efetch calls
def medlineEsearch(self, query):
# print ("MedlineFetcher::medlineEsearch :")
"""Get the number of results for 'query' in variable 'count',
plus 'queryKey' and 'webEnv', which are used by 'medlineEfetch'."""
# print(query)
origQuery = query
query = query.replace(' ', '%20')
eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
% ( self.pubMedEutilsURL, self.pubMedDB, query )
try:
eSearchResult = urlopen(eSearch)
data = eSearchResult.read()
root = etree.XML(data)
findcount = etree.XPath("/eSearchResult/Count/text()")
count = findcount(root)[0]
findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
queryKey = findquerykey(root)[0]
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
except Exception as Error:
print(Error)
count = 0
queryKey = False
webEnv = False
origQuery = False
values = { "query" : origQuery
, "count" : int(count)
, "queryKey" : queryKey
, "webEnv" : webEnv
}
return values
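# Sketch of a call to medlineEsearch (count/keys below are placeholders):
# >>> self.medlineEsearch("2015[dp] brain")
# builds .../esearch.fcgi?db=Pubmed&retmax=1&usehistory=y&term=2015[dp]%20brain
# and returns e.g.
# {"query": "2015[dp] brain", "count": 12345,
#  "queryKey": "1", "webEnv": "NCID_1_..."}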
# RETMAX:
# Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
# maximum of 100,000 records
def medlineEfetchRAW(self, fullquery):
"""Build the efetch URL for query 'query', saving results to file every 'retmax' articles."""
query = fullquery["string"]
retmax = fullquery["retmax"]
count = fullquery["count"]
queryKey = fullquery["queryKey"]
webEnv = fullquery["webEnv"]
queryNoSpace = query.replace(' ', '') # no space in directory and file names, avoids errors
# print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
retstart = 0
eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
return eFetch
# generic!
def download(self, url):
print(url)
filename = download(url)
with self.lock:
print(threading.current_thread().name, filename+" OK")
return filename
# generic!
def do_work(self,item):
# time.sleep(1) # pretend to do some lengthy work.
returnvalue = self.medlineEsearch(item)
with self.lock:
# print(threading.current_thread().name, item)
return returnvalue
# The worker thread pulls an item from the queue and processes it
def worker(self):
while True:
item = self.q.get()
self.firstResults.append(self.do_work(item))
self.q.task_done()
def worker2(self):
while True:
item = self.q.get()
try:
result = self.download(item)
except Exception as error:
print(error)
result = False
self.firstResults.append(result)
self.q.task_done()
def chunks(self , l , n):
print("chunks:")
for i in range(0, len(l), n):
yield l[i:i+n]
# GLOBALLIMIT:
# I will retrieve exactly this amount of publications.
# The number of publications retrieved for a given year is:
# (k/N) * GlobalLimit
# \_ this is used as RETMAX
# - k : number of publications for year x (according to PubMed)
# - N : sum of every k (total number of publications according to PubMed)
# - GlobalLimit : number of publications I want
# (see the worked example after serialFetcher below)
def serialFetcher(self , yearsNumber , query, globalLimit):
# Create the queue and thread pool.
for i in range(self.queue_size):
t = threading.Thread(target=self.worker)
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
start = time.perf_counter()
N = 0
# print ("MedlineFetcher::serialFetcher :")
thequeries = []
globalresults = []
for i in range(yearsNumber):
year = str(2015 - i) # NB: hardcoded reference year
# print ('YEAR ' + year)
# print ('---------\n')
pubmedquery = str(year) + '[dp] '+query
self.q.put( pubmedquery ) #put task in the queue
self.q.join()
print('time:',time.perf_counter() - start)
Total = 0
Fails = 0
for globalresults in self.firstResults:
# globalresults = self.medlineEsearch(pubmedquery)
Total += 1
if globalresults["queryKey"]==False:
Fails += 1
if globalresults["count"] > 0 :
N+=globalresults["count"]
queryhyperdata = { "string" : globalresults["query"]
, "count" : globalresults["count"]
, "queryKey" : globalresults["queryKey"]
, "webEnv" : globalresults["webEnv"]
, "retmax" : 0
}
thequeries.append ( queryhyperdata )
print("Total Number:", N,"publications")
print("And i want just:",globalLimit,"publications")
print("---------------------------------------\n")
for i,query in enumerate(thequeries):
k = query["count"]
proportion = k/float(N)
retmax_forthisyear = int(round(globalLimit*proportion))
query["retmax"] = retmax_forthisyear
if query["retmax"] == 0 : query["retmax"]+=1
print(query["string"],"\t[",k,">",query["retmax"],"]")
if ((Fails+1)/(Total+1)) == 1 : # for identifying the epic fail or connection error
thequeries = [False]
return thequeries
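# Worked example of the proportional allocation above (illustrative numbers):
# with N = 1000 publications over all years and globalLimit = 200,
# a year with k = 400 gets retmax = round(200 * 400/1000) = 80,
# and a year with k = 2 gets round(200 * 2/1000) = 0, bumped to 1.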
@@ -2,6 +2,8 @@ from ._Parser import Parser
 from datetime import datetime
 from bs4 import BeautifulSoup
 from lxml import etree
+#import asyncio
+#q = asyncio.Queue(maxsize=0)
 class CernParser(Parser):
 #mapping MARC21 ==> hyperdata
@@ -52,10 +54,15 @@ class CernParser(Parser):
 print("Date", hyperdata["publication_date"])
 return hyperdata
+#@asyncio.coroutine
 def parse(self, file):
+print("PARSING")
 hyperdata_list = []
 doc = file.read()
-soup = BeautifulSoup(doc.decode("utf-8"), "lxml")
+print(doc[:35])
+soup = BeautifulSoup(doc, "lxml")
+#print(soup.find("record"))
 for record in soup.find_all("record"):
 hyperdata = {v:[] for v in self.MARC21["100"].values()}
 hyperdata["uid"] = soup.find("controlfield").text
...
from ._Parser import Parser
# from ..NgramsExtractors import *
import sys
import csv
csv.field_size_limit(sys.maxsize)
import numpy as np
import os
class CSVParser(Parser):
def CSVsample( self, small_contents , delim) :
reader = csv.reader(small_contents, delimiter=delim)
Freqs = []
for row in reader:
Freqs.append(len(row))
return Freqs
def parse(self, filebuf):
print("CSV: parsing (assuming UTF-8 and LF line endings)")
contents = filebuf.read().decode("UTF-8").split("\n")
sample_size = 10
sample_contents = contents[0:sample_size]
hyperdata_list = []
# # = = = = [ Getting delimiters frequency ] = = = = #
PossibleDelimiters = [ ',',' ','\t', ';', '|', ':' ]
AllDelimiters = {}
for delim in PossibleDelimiters:
AllDelimiters[delim] = self.CSVsample( sample_contents , delim )
# # = = = = [ / Getting delimiters frequency ] = = = = #
# # OUTPUT example:
# # AllDelimiters = {
# # '\t': [1, 1, 1, 1, 1],
# # ' ': [1, 13, 261, 348, 330],
# # ',': [15, 15, 15, 15, 15],
# # ';': [1, 1, 1, 1, 1],
# # '|': [1, 1, 1, 1, 1]
# # }
# # = = = = [ Stand.Dev=0 & Sum of delimiters ] = = = = #
Delimiters = []
for d in AllDelimiters:
freqs = AllDelimiters[d]
suma = np.sum( freqs )
if suma >0:
std = np.std( freqs )
# print [ d , suma , len(freqs) , std]
if std == 0:
Delimiters.append ( [ d , suma , len(freqs) , std] )
# # = = = = [ / Stand.Dev=0 & Sum of delimiters ] = = = = #
# # OUTPUT example:
# # Delimiters = [
# # ['\t', 5, 5, 0.0],
# # [',', 75, 5, 0.0],
# # ['|', 5, 5, 0.0]
# # ]
# # = = = = [ Delimiter selection ] = = = = #
Sorted_Delims = sorted(Delimiters, key=lambda x: x[1], reverse=True)
HighestDelim = Sorted_Delims[0][0]
# HighestDelim = ","
print("CSV selected delimiter:",[HighestDelim])
# # = = = = [ / Delimiter selection ] = = = = #
# # = = = = [ First data coordinate ] = = = = #
Coords = {
"row": -1,
"column": -1
}
reader = csv.reader(contents, delimiter=HighestDelim)
for rownum, tokens in enumerate(reader):
if rownum % 250 == 0:
print("CSV row: ", rownum)
joined_tokens = "".join (tokens)
if Coords["row"]<0 and len( joined_tokens )>0 :
Coords["row"] = rownum
for columnum in range(len(tokens)):
t = tokens[columnum]
if len(t)>0:
Coords["column"] = columnum
break
# # = = = = [ / First data coordinate ] = = = = #
# # = = = = [ Setting Headers ] = = = = #
Headers_Int2Str = {}
reader = csv.reader(contents, delimiter=HighestDelim)
for rownum, tokens in enumerate(reader):
if rownum>=Coords["row"]:
for columnum in range( Coords["column"],len(tokens) ):
t = tokens[columnum]
Headers_Int2Str[columnum] = t
break
# print("Headers_Int2Str")
# print(Headers_Int2Str)
# # = = = = [ / Setting Headers ] = = = = #
# # OUTPUT example:
# # Headers_Int2Str = {
# # 0: 'publication_date',
# # 1: 'publication_month',
# # 2: 'publication_second',
# # 3: 'abstract'
# # }
# # = = = = [ Reading the whole CSV and saving ] = = = = #
hyperdata_list = []
reader = csv.reader(contents, delimiter=HighestDelim)
for rownum, tokens in enumerate(reader):
if rownum>Coords["row"]:
RecordDict = {}
for columnum in range( Coords["column"],len(tokens) ):
data = tokens[columnum]
RecordDict[ Headers_Int2Str[columnum] ] = data
if len(RecordDict.keys())>0:
hyperdata_list.append( RecordDict )
# # = = = = [ / Reading the whole CSV and saving ] = = = = #
return hyperdata_list
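# Minimal usage sketch (the file name and the bare constructor call are
# assumptions about the Parser base class, which is not shown in full here):
# with open("corpus.csv", "rb") as f:
#     records = CSVParser().parse(f)
# # 'records' is a list of dicts keyed by the detected header row.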
from ._Parser import Parser
from datetime import datetime
from io import BytesIO
import json
class ISTexParser(Parser):
def parse(self, filebuf):
contents = filebuf.read().decode("UTF-8")
data = json.loads(contents)
filebuf.close()
json_docs = data["hits"]
hyperdata_list = []
hyperdata_path = {
"id" : "id",
"source" : 'corpusName',
"title" : 'title',
"genre" : "genre",
"language_iso3" : 'language',
"doi" : 'doi',
"host" : 'host',
"publication_date" : 'publicationDate',
"abstract" : 'abstract',
# "authors" : 'author',
"authorsRAW" : 'author',
"keywords" : "keywords"
}
suma = 0
for json_doc in json_docs:
hyperdata = {}
for key, path in hyperdata_path.items():
try:
# print(path," ==> ",len(json_doc[path]))
hyperdata[key] = json_doc[path]
except KeyError:
pass
# print("|",hyperdata["language_iso3"])
if "doi" in hyperdata:
hyperdata["doi"] = hyperdata["doi"][0]
keywords = []
if "keywords" in hyperdata:
for keyw in hyperdata["keywords"]:
keywords.append(keyw["value"] )
hyperdata["keywords"] = ", ".join( keywords )
moredate=False
moresource=False
if "host" in hyperdata:
if "genre" in hyperdata["host"] and len(hyperdata["host"]["genre"])>0:
if "genre" in hyperdata and len(hyperdata["genre"])==0:
hyperdata["genre"] = hyperdata["host"]["genre"]
# print(hyperdata["host"])
if "pubdate" in hyperdata["host"]:
onebuffer = hyperdata["publication_date"]
hyperdata["publication_date"] = []
hyperdata["publication_date"].append(onebuffer)
hyperdata["publication_date"].append( hyperdata["host"]["pubdate"] )
if "title" in hyperdata["host"]:
hyperdata["journal"] = hyperdata["host"]["title"]
authors=False
if "authorsRAW" in hyperdata:
names = []
for author in hyperdata["authorsRAW"]:
names.append(author["name"])
hyperdata["authors"] = ", ".join(names)
if "host" in hyperdata: hyperdata.pop("host")
if "genre" in hyperdata:
if len(hyperdata["genre"])==0:
hyperdata.pop("genre")
if "language_iso3" in hyperdata:
# retrieve lang if lang != [] and lang != ["unknown"]
# ---------------------------------------------------
if len(hyperdata["language_iso3"])>0 and hyperdata["language_iso3"][0] != "unknown" :
hyperdata["language_iso3"] = hyperdata["language_iso3"][0]
# default value = eng
# (possibly even better: langid.classify(abstract))
else:
# NB: 97% of ISTEX docs are in English, hence the default
# ----------------------------------------------
hyperdata["language_iso3"] = "eng"
# (cf. api.istex.fr/document/?q=*&facet=language
# and langid tests on documents where language=["unknown"])
if "publication_date" in hyperdata:
RealDate = hyperdata["publication_date"]
if "publication_date" in hyperdata:
hyperdata.pop("publication_date")
if isinstance(RealDate, list):
RealDate = RealDate[0]
# print( RealDate ," | length:",len(RealDate))
Decision=""
if len(RealDate)>4:
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y-%b-%d').date()
except:
try: Decision = datetime.strptime(RealDate, '%Y-%m-%d').date()
except: Decision=False
else:
try: Decision = datetime.strptime(RealDate, '%Y-%b').date()
except:
try: Decision = datetime.strptime(RealDate, '%Y-%m').date()
except: Decision=False
else:
try: Decision = datetime.strptime(RealDate, '%Y').date()
except: Decision=False
if Decision!=False:
hyperdata["publication_year"] = str(Decision.year)
hyperdata["publication_month"] = str(Decision.month)
hyperdata["publication_day"] = str(Decision.day)
hyperdata_list.append(hyperdata)
# print("\t||",hyperdata["title"])
# print("\t\t",Decision)
# print("=============================")
# else:
# suma+=1
# if "pubdate" in json_doc:
# print ("\tfail pubdate:",json_doc["pubdate"])
# print ("nb_hits:",len(json_docs))
# print("\t - nb_fails:",suma)
# print(" -- - - - - - -- - -")
return hyperdata_list
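# Examples of the date fallback chain above (illustrative values):
#   "2015-03-17" -> %Y-%m-%d -> year 2015, month 3, day 17
#   "2015-03"    -> %Y-%m    -> year 2015, month 3, day 1
#   "2015"       -> %Y       -> year 2015, month 1, day 1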
@@ -31,6 +31,7 @@ class PubmedParser(Parser):
 if isinstance(file, bytes):
 file = BytesIO(file)
 xml = etree.parse(file, parser=self.xml_parser)
+#print(xml.find("PubmedArticle"))
 xml_articles = xml.findall('PubmedArticle')
 # initialize the list of hyperdata
 hyperdata_list = []
...
from ._Parser import Parser
from gargantext.util.languages import languages
#from admin.utils import PrintException
class RepecParser(Parser):
# def __init__(self, language_cache=None):
#
# #super(Parser, self).__init__()
# #super(Parser, self).__init__()
# self._languages_cache = LanguagesCache() if language_cache is None else language_cache
_begin = 6
_parameters = {
b"ER": {"type": "delimiter"},
b"T1": {"type": "hyperdata", "key": "title", "separator": " "},
b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
b"A1": {"type": "hyperdata", "key": "authors", "separator": "\n"},
b"JO": {"type": "hyperdata", "key": "journal"},
b"UR": {"type": "hyperdata", "key": "doi"},
b"Y1": {"type": "hyperdata", "key": "publication_year"},
b"PD": {"type": "hyperdata", "key": "publication_month"},
b"N1": {"type": "hyperdata", "key": "references", "separator": ", "},
b"LA": {"type": "hyperdata", "key": "language_iso2"},
b"N2": {"type": "hyperdata", "key": "abstract", "separator": " "},
b"WC": {"type": "hyperdata", "key": "fields"},
}
def parse(self, file):
hyperdata = {}
last_key = None
last_values = []
# browse every line of the file
for line in file:
if len(line) > 2 :
# extract the parameter key
parameter_key = line[:2]
if parameter_key != b' ' and parameter_key != last_key:
if last_key in self._parameters:
# translate the parameter key
parameter = self._parameters[last_key]
if parameter["type"] == "hyperdata":
separator = parameter["separator"] if "separator" in parameter else ""
if parameter["key"] == "publication_year":
hyperdata[parameter["key"]] = separator.join(last_values)[:4]
else:
hyperdata[parameter["key"]] = separator.join(last_values)
elif parameter["type"] == "delimiter":
if 'language_fullname' not in hyperdata.keys():
if 'language_iso3' not in hyperdata.keys():
if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en'
yield hyperdata
hyperdata = {}
last_key = parameter_key
last_values = []
try:
last_values.append(line[self._begin:-1].decode())
except Exception as error:
print(error)
# if a hyperdata object is left in memory, yield it as well
if hyperdata:
yield hyperdata
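# Illustrative input (RIS-like tagged lines, with values starting at index 6
# as implied by _begin = 6; the sample values are assumptions):
#   T1  - A title of the paper
#   A1  - Some Author
#   Y1  - 2015
#   LA  - en
#   ER  -
# yields {"title": "A title of the paper", "authors": "Some Author",
#         "publication_year": "2015", "language_iso2": "en"}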
@@ -20,14 +20,9 @@ class Parser:
 self._file = file
 def __del__(self):
-self._file.close()
+if hasattr(self, '_file'):
+self._file.close()
-def detect_format(self, afile, a_formats):
-#import magic
-print("Detecting format")
-#print(magic.from_file(afile))
-return
 def detect_encoding(self, string):
 """Useful method to detect the encoding of a document.
@@ -167,6 +162,8 @@ class Parser:
 def __iter__(self, file=None):
 """Parse the file, and its children files found in the file.
+C24B comment: the file storage/extraction should be done upstream,
+and this method is a bit obscure
 """
 if file is None:
 file = self._file
...
-from .Ris import RISParser
-from .Ris_repec import RepecParser
-from .Isi import ISIParser
-# from .Jstor import JstorParser
-# from .Zotero import ZoteroParser
-from .Pubmed import PubmedParser
-# # 2015-12-08: parser 2 en 1
-from .Europress import EuropressParser
-from .ISTex import ISTexParser
-from .CSV import CSVParser
-from .Cern import CernParser
+import importlib
+from gargantext.constants import RESOURCETYPES
+from gargantext.settings import DEBUG
+#if DEBUG:
+# print("Loading available PARSERS:")
+base_parser = "gargantext.util.parsers"
+for resource in RESOURCETYPES:
+if resource["parser"] is not None:
+#the parser file name is the parser name without "Parser"
+try:
+fname = resource["parser"].replace("Parser", "")
+#the parser file is formatted as a title
+module = base_parser+".%s" %(fname.title())
+#the parser module is as declared in constants
+parser = importlib.import_module(module)
+#if DEBUG:
+# print("\t-", resource["parser"])
+#getattr(parser,resource["parser"])
+except Exception as e:
+print("Check constants.py %s \nLANGUAGES declaration of taggers. Parser %s is not available" %(str(e), resource["parser"]))