[REPEC/PARSERS] Some fixes (filepath, type corrections in other parsers).

44b444e1 · delanoe · 7eadf91e · 44b444e1 · 44b444e1 · 44b444e1
Commit 44b444e1 authored Apr 12, 2017 by delanoe
Showing with 87 additions and 13 deletions

MULTIVAC.py gargantext/util/crawlers/MULTIVAC.py +11 -11

ISTEX.py gargantext/util/parsers/ISTEX.py +1 -1

MULTIVAC.py gargantext/util/parsers/MULTIVAC.py +74 -0

PUBMED.py gargantext/util/parsers/PUBMED.py +1 -1

No files found.
--- a/gargantext/util/crawlers/MULTIVAC.py
+++ b/gargantext/util/crawlers/MULTIVAC.py
@@ -8,8 +8,10 @@

 from ._Crawler import *
 import json
-from gargantext.settings import API_TOKENS
+from gargantext.settings  import API_TOKENS
+from gargantext.constants import UPLOAD_DIRECTORY
 from math import trunc
+from gargantext.util.files import save

 class MultivacCrawler(Crawler):
    ''' Multivac API CLIENT'''
@@ -79,8 +81,9 @@ class MultivacCrawler(Crawler):
        return self.results_nb

    def download(self, query):
-        self.path = "/tmp/MultivacResults.xml"
+        
        downloaded = False
+        
        self.status.append("fetching results")

        corpus = []
@@ -92,15 +95,12 @@ class MultivacCrawler(Crawler):
            print("ERROR (scrap: multivac d/l ): ",msg)
            self.query_max = QUERY_SIZE_N_MAX
        
-        
-        with open(self.path, 'wb') as f:
-            #for page in range(1, self.query_max, paging):
-            for page in range(1, trunc(self.query_max / 100) + 1):
-                docs = self._get(query, fromPage=page, count=paging)["results"]["hits"]
-                for doc in docs:
-                    corpus.append(doc)
+        for page in range(1, trunc(self.query_max / 100) + 1):
+            docs = self._get(query, fromPage=page, count=paging)["results"]["hits"]
+            for doc in docs:
+                corpus.append(doc)

-            f.write(json.dumps(corpus).encode("utf-8"))
-            downloaded = True
+        self.path = save(json.dumps(corpus).encode("utf-8"), name='Multivac', basedir=UPLOAD_DIRECTORY )
+        downloaded = True
        
        return downloaded
--- a/gargantext/util/parsers/ISTEX.py
+++ b/gargantext/util/parsers/ISTEX.py
@@ -104,7 +104,7 @@ class ISTexParser(Parser):
                    RealDate = RealDate[0]

                # print( RealDate ," | length:",len(RealDate))
-                Decision=""
+                Decision = True
                if len(RealDate)>4:
                    if len(RealDate)>8:
                        try: Decision = datetime.strptime(RealDate, '%Y-%b-%d').date()

--- a/gargantext/util/parsers/MULTIVAC.py
+++ b/gargantext/util/parsers/MULTIVAC.py
+from ._Parser import Parser
+from datetime import datetime
+import json
+
+class MultivacParser(Parser):
+
+    def parse(self, filebuf):
+        '''
+        parse :: FileBuff -> [Hyperdata]
+        '''
+        contents = filebuf.read().decode("UTF-8")
+        data = json.loads(contents)
+        
+        filebuf.close()
+        
+        json_docs = data
+        hyperdata_list = []
+        
+        hyperdata_path = {
+            "id"                : "id",
+            "title"             : "title",
+            "abstract"          : "abstract",
+            "type"              : "type"
+        }
+
+        suma = 0
+        
+        for json_doc in json_docs:
+
+            hyperdata = {}
+            
+            doc = json_doc["_source"]
+
+            for key, path in hyperdata_path.items():
+                    hyperdata[key] = doc.get(path, "")
+            
+            hyperdata["source"] = doc.get("serial"      , {})\
+                                     .get("journaltitle", "REPEC Database")
+            
+            try:
+                hyperdata["url"]    = doc.get("file", {})\
+                                         .get("url" , "")
+            except:
+                pass
+
+            hyperdata["authors"] = ", ".join(
+                                             [ p.get("person", {})
+                                                .get("name"  , "")
+                          
+                                               for p in doc.get("hasauthor", [])
+                                             ]
+                                            )
+            
+
+            year = doc.get("serial"  , {})\
+                      .get("issuedate", None)
+            
+            if year is None:
+                year = datetime.now()
+            else:
+                try:
+                    date = datetime.strptime(year, '%Y')
+                except:
+                    print("FIX DATE MULTIVAC REPEC %s" % year)
+                    year = datetime.now()
+
+            hyperdata["publication_date"] = date
+            hyperdata["publication_year"]  = str(date.year)
+            hyperdata["publication_month"] = str(date.month)
+            hyperdata["publication_day"]   = str(date.day)
+            
+            hyperdata_list.append(hyperdata)
+        
+        return hyperdata_list
--- a/gargantext/util/parsers/PUBMED.py
+++ b/gargantext/util/parsers/PUBMED.py
@@ -78,7 +78,7 @@ class PubmedParser(Parser):
            if "publication_month" in hyperdata: PubmedDate+=" "+hyperdata["publication_month"]
            if "publication_day" in hyperdata: PubmedDate+=" "+hyperdata["publication_day"]

-            Decision=""
+            Decision=True
            if len(RealDate)>4:
                if len(RealDate)>8:
                    try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()