Referencing the Parser

d19c6877 · c24b · 0950a1ac · d19c6877 · d19c6877
Commit d19c6877 authored May 10, 2016 by c24b
Hide whitespace changes
Inline Side-by-side

Showing with 57 additions and 1 deletion

Cern.py gargantext/util/parsers/Cern.py +56 -0

__init__.py gargantext/util/parsers/__init__.py +1 -1

No files found.
--- a/gargantext/util/parsers/Cern.py
+++ b/gargantext/util/parsers/Cern.py
+from ._Parser import Parser
+from datetime import datetime
+from io import BytesIO
+import json
+class CernParser(Parser):
+    """Parse CERN MARC21 XML exports into one flat dict per <record>."""
+
+    # MARC21 datafield tag -> {subfield code -> output key}.
+    # This has to be a plain class attribute: `self` does not exist in a
+    # class body. The two "024" entries are merged into one, because a
+    # repeated dict key silently dropped the "DOI" mapping.
+    MARC21 = {
+        "100": {"a": "author_name",
+                "v": "author_affiliation",
+                "w": "author_country",
+                "m": "author_mail",
+                },
+        "700": {"a": "coauthor_name",
+                "v": "coauthor_affiliation",
+                "w": "coauthor_country",
+                },
+        "773": {"c": "pages",
+                "n": "issue",
+                "p": "journal",
+                "v": "volume",
+                "y": "year",
+                },
+        "024": {"a": "DOI", "t": "Date"},
+        "037": {"a": "ArXiv"},
+        "022": {"a": "ISSN"},
+        "245": {"a": "Title"},
+        "520": {"a": "Abstract"},
+        "260": {"b": "Publisher", "c": "Pubdate"},
+        "540": {"a": "Licence"},
+        "653": {"a": "keywords"},
+        "856": {"u": "pdf_source"},
+    }
+
+    def parse(self, filebuf):
+        """Return a list of dicts, one per MARC21 record found in filebuf.
+
+        filebuf is a file name or file object containing MARC21 XML.
+        Each dict holds "uid" (text of the record's first controlfield, or
+        None when there is none) plus the keys mapped in MARC21. Co-author
+        ("700") keys are always lists; every other key holds the text of
+        the last matching subfield.
+        """
+        # Function-scope stdlib import: the module imports no XML library.
+        # NOTE(review): ElementTree is not hardened against malicious XML.
+        import xml.etree.ElementTree as ET
+
+        def local(node):
+            # Tag name without its "{namespace}" prefix, so namespaced
+            # MARC21 documents match the same way as bare ones.
+            return node.tag.rsplit("}", 1)[-1]
+
+        records = []
+        for record in ET.parse(filebuf).iter():
+            if local(record) != "record":
+                continue
+            r = {name: [] for name in self.MARC21["700"].values()}
+            r["uid"] = None
+            seen_controlfield = False
+            # Search inside the current record only, never the whole tree,
+            # otherwise every record would receive the same values.
+            for field in record.iter():
+                kind = local(field)
+                if kind == "controlfield" and not seen_controlfield:
+                    seen_controlfield = True
+                    r["uid"] = field.text
+                    continue
+                tag = field.get("tag")
+                if kind != "datafield" or tag not in self.MARC21:
+                    continue
+                codes = self.MARC21[tag]
+                for sub in field.iter():
+                    if local(sub) != "subfield":
+                        continue
+                    name = codes.get(sub.get("code"))
+                    if name is None:
+                        continue
+                    text = "".join(sub.itertext())
+                    if tag == "700":
+                        r[name].append(text)
+                    else:
+                        r[name] = text
+            records.append(r)
+        return records
--- a/gargantext/util/parsers/__init__.py
+++ b/gargantext/util/parsers/__init__.py
@@ -9,4 +9,4 @@ from .Europress import EuropressParser
 from .ISTex import ISTexParser
 from .CSV import CSVParser
-from .CERN  import CernParser
+from .Cern  import CernParser