Commit f197ccb4 authored by c24b

[CERN XML PARSER] OK

parent 4454aa47
from ._Parser import Parser
from datetime import datetime
from bs4 import BeautifulSoup
#from io import BytesIO
from io import StringIO
import json
from lxml import etree
class CernParser(Parser):
@@ -11,90 +8,86 @@ class CernParser(Parser):
MARC21 = {
#here main author
"100":{
"a": "author_name",
"v": "author_affiliation",
"w": "author_country",
"m": "author_mail",
"a": "authors",
"v": "authors_affiliations",
"w": "authors_countries",
"m": "authors_mails",
},
-#here cooauthor
+#here co-author; merged back into the authors list, with the main author kept at index [0]
"700": {
"a": "authors_name",
"v": "authors_affiliation",
"w": "authors_country",
"a": "authors",
"v": "authors_affiliations",
"w": "authors_countries",
},
"773":{
"c": "pages",
"n": "issue",
"p": "journal",
"v": "volume",
"y": "year"
"y": "publication_year"
},
"024": {"a":"doi"},
"037": {"a":"arxiv"},
"022": {"a":"isbn"},
#"037": {"a":"arxiv"},
#"022": {"a":"isbn"},
"245": {"a":"title"},
"520": {"a":"abstract"},
"260": {"b":"publisher","c":"pubdate"},
#"024": {"t":"date"},
"260": {"b":"publisher","c":"publication_date"},
"024": {"t":"realdate_full_"}, #correspond to query date
#"540": {"a":"licence"},
#"653": {"a":"keywords"},
#"856": {"u":"pdf_source"},
"856": {"u":"pdf_source"},
}
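#For reference, the mapping above is applied to MARCXML datafields of roughly
#this shape (illustrative values only, not taken from a real CERN export):
#  <record>
#    <controlfield tag="001">12345</controlfield>
#    <datafield tag="100">
#      <subfield code="a">Doe, J.</subfield>
#      <subfield code="v">Example University</subfield>
#      <subfield code="w">XX</subfield>
#    </datafield>
#    <datafield tag="700">
#      <subfield code="a">Smith, A.</subfield>
#    </datafield>
#  </record>
#Tag "100" (main author) and tag "700" (co-authors) feed the same "authors*"
#lists in parse() below, with the main author kept at index 0, so this record
#would yield authors == "Doe, J.,Smith, A." after the final join.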
#~ hyperdata_item = {
#~ "journal" : '',
#~ "title" : '',
#~ "abstract" : '',
#~ "title" : '',
#~ "language_iso2" : 'en',
#~ "doi" : '',
#~ "realdate_full_" : '',
#~ "realdate_year_" : '',
#~ "realdate_month_" : '',
#~ "realdate_day_" : '',
#~ "publication_year" : '',
#~ "publication_month" : '',
#~ "publication_day" : '',
#~ "authors" : '',
#~ "authors_countries" : '',
#~ "authors_affiliations": '',
#~ "publisher": '',
#~ }
def format_date(self, hyperdata):
'''Split publication_date into separate year/month/day/hour/minute/second fields and normalize it.'''
prefix = "publication"
date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
hyperdata[prefix + "_year"] = date.strftime('%Y')
hyperdata[prefix + "_month"] = date.strftime("%m")
hyperdata[prefix + "_day"] = date.strftime("%d")
hyperdata[prefix + "_hour"] = date.strftime("%h")
hyperdata[prefix + "_minute"] = date.strftime("%m")
hyperdata[prefix + "_second"] = date.strftime("%s")
hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
return hyperdata
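#For example (assuming the "260" $c subfield held "2015-07-02"), format_date
#turns {"publication_date": "2015-07-02"} into
#  publication_year "2015", publication_month "07", publication_day "02",
#  publication_hour/minute/second "00", and
#  publication_date "2015-07-02 00:00:00".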
def parse(self, file):
if isinstance(file, str):
file = open(file, 'rb')
-doc = etree.parse(file.read())
-tree = etree.tostring(doc)
-#parser = etree.XMLParser()
-hyperdata_list =[]
-soup = BeautifulSoup(tree, "lxml")
+hyperdata_list = []
+doc = file.read()
+soup = BeautifulSoup(doc.decode("utf-8"), "lxml")
for record in soup.find_all("record"):
-r = {v:[] for v in self.MARC21["700"].values()}
-r["uid"] = soup.find("controlfield").text
+hyperdata = {v:[] for v in self.MARC21["100"].values()}
+hyperdata["uid"] = soup.find("controlfield").text
+hyperdata["language_iso2"] = "en"
for data in soup.find_all("datafield"):
tag = data.get("tag")
if tag in self.MARC21.keys():
for sub in data.find_all("subfield"):
code = sub.get("code")
if code in self.MARC21[tag].keys():
if tag == "700":
r[self.MARC21[tag][code]].append(sub.text)
if tag == "100":
r[self.MARC21["700"][code]].insert(0,sub.text)
try:
hyperdata[self.MARC21["100"][code]].insert(0,sub.text)
except AttributeError:
hyperdata[self.MARC21["100"][code]] = [sub.text]
#print ("1", self.MARC21["100"][code], hyperdata[self.MARC21["100"][code]])
elif tag == "700":
#print ("7", self.MARC21["100"][code], hyperdata[self.MARC21["100"][code]])
try:
hyperdata[self.MARC21["100"][code]].append(sub.text)
except AttributeError:
hyperdata[self.MARC21["100"][code]] = [sub.text]
else:
-r[self.MARC21[tag][code]] = sub.text
-print(r)
-#hyperdata_list.append(r["uid.decode('utf-8'))
-break
+hyperdata[self.MARC21[tag][code]] = sub.text
+hyperdata["authors_countries"] = ",".join(hyperdata["authors_countries"])
+hyperdata["authors_affiliations"] = ",".join(hyperdata["authors_affiliations"])
+hyperdata["authors"] = ",".join(hyperdata["authors"])
+hyperdata["authors_mails"] = ",".join(hyperdata["authors_mails"])
+hyperdata = self.format_date(hyperdata)
+hyperdata_list.append(hyperdata)
return hyperdata_list
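#Roughly, each dict appended to hyperdata_list looks like this (field values
#are illustrative, not real CERN data):
#  {"uid": "12345", "language_iso2": "en",
#   "authors": "Doe, J.,Smith, A.", "authors_affiliations": "Example University,Other Lab",
#   "authors_countries": "XX,YY", "authors_mails": "",
#   "title": "...", "journal": "...", "publisher": "...",
#   "publication_date": "2015-07-02 00:00:00", "publication_year": "2015", ...}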
if __name__ == "__main__":
pass
#~ e = CernParser()
#~ hyperdata = e.parse(str(sys.argv[1]))
#~ for h in hyperdata:
#~ try:
#~ print(h['journal'], ":", h['publication_date'])
#~ except:
#~ pass
#~ break
@@ -2,7 +2,6 @@ import datetime
import dateutil.parser
import zipfile
import re
-import dateparser as date_parser
from gargantext.util.languages import languages
@@ -23,8 +22,12 @@ class Parser:
def __del__(self):
self._file.close()
-def detect_format(self, accepted_format):
-print(self._file[:1000])
+def detect_format(self, afile, a_formats):
+#import magic
+print("Detecting format")
+#print(magic.from_file(afile))
+return
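#A possible way to fill in this stub, assuming the python-magic package were
#added as a dependency (it is not one here; this is only a sketch):
#  import magic
#  def detect_format(self, afile, a_formats):
#      mime = magic.from_file(afile, mime=True)  # e.g. "text/xml" or "application/zip"
#      return mime if mime in a_formats else None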
def detect_encoding(self, string):
"""Useful method to detect the encoding of a document.
@@ -110,10 +113,10 @@ class Parser:
hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S")
+print(hyperdata['publication_date'])
# finally, return the transformed result!
return hyperdata
-print(hyperdata['publication_date'])
def format_hyperdata_languages(self, hyperdata):
"""format the languages found in the hyperdata."""