[UPDATE] merge conflict

5160d178 · PkSM3 · 0853ddca · 2644e642 · 5160d178 · 5160d178
Commit 5160d178 authored May 19, 2015 by PkSM3
11 changed files
--- a/init/sql/drop_db.sh
+++ b/init/sql/drop_db.sh
+
+# Run drop_db.sql against the "gargandb" database as user "gargantua".
+#
+# psql takes the password from the PGPASSWORD environment variable. A value
+# already exported by the caller takes precedence; the literal below is kept
+# only as the legacy fallback so existing invocations keep working.
+# SECURITY(review): the fallback is a credential committed to version
+# control. Rotate it and supply it via ~/.pgpass or the environment instead.
+export PGPASSWORD="${PGPASSWORD:-C8kdcUrAQy66U}"
+
+psql -U gargantua -d gargandb -f drop_db.sql
--- a/init/sql/init.sql
+++ b/init/sql/init.sql
--- a/init/sql/rename_metadata.sql
+++ b/init/sql/rename_metadata.sql
+
+-- Rename the "metadata" column and tables to "hyperdata".
+-- All four renames run in a single transaction: if any statement fails,
+-- the transaction is aborted and none of the renames are applied, so the
+-- schema is never left half-renamed.
+BEGIN;
+
+ALTER TABLE node_node RENAME COLUMN metadata TO hyperdata;
+
+ALTER TABLE node_metadata RENAME TO node_hyperdata;
+
+-- The table must be renamed first: the column rename below addresses it
+-- by its new name.
+ALTER TABLE node_node_metadata RENAME TO node_node_hyperdata;
+ALTER TABLE node_node_hyperdata RENAME COLUMN metadata_id TO hyperdata_id;
+
+COMMIT;
+
--- a/node/models.py
+++ b/node/models.py
@@ -306,7 +306,6 @@ class Node(CTENode):
        self.hyperdata['Processing'] = 0
        self.save()

-
 class Node_Hyperdata(models.Model):
    node        = models.ForeignKey(Node, on_delete=models.CASCADE)
    hyperdata    = models.ForeignKey(Hyperdata)

--- a/parsing/FileParsers/EuropressFileParser.py
+++ b/parsing/FileParsers/EuropressFileParser.py
@@ -8,7 +8,7 @@ import dateutil.parser
 from .FileParser import FileParser
 from ..NgramsExtractors import *

-
+from admin.utils import PrintException

 class EuropressFileParser(FileParser):

@@ -29,8 +29,8 @@ class EuropressFileParser(FileParser):
        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
-            except Exception as error:
-                print(error)
+            except:
+                PrintException()
 #                try:
 #                    contents = contents.decode(encoding, errors='replace').encode(codif)
 #                except Exception as error:
@@ -40,7 +40,7 @@ class EuropressFileParser(FileParser):
            html_parser = etree.HTMLParser(encoding=codif)
            html = etree.fromstring(contents, html_parser)

-            try:
+            try :

                format_europresse = 50
                html_articles = html.xpath('/html/body/table/tbody')
@@ -51,15 +51,15 @@ class EuropressFileParser(FileParser):
                    if len(html_articles) < 1:
                        format_europresse = 1
                        html_articles = html.xpath('//div[@id="docContain"]')
-            except Exception as error:
-                print(error)
+            except :
+                PrintException()

-            if format_europresse == 50:
+            if format_europresse == 50 :
                name_xpath      = "./tr/td/span[@class = 'DocPublicationName']"
-                header_xpath = "//span[@class = 'DocHeader']"
+                header_xpath    = "./tr/td/span[@class = 'DocHeader']"
                title_xpath     = "string(./tr/td/span[@class = 'TitreArticleVisu'])"
                text_xpath      = "./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
-            elif format_europresse == 1:
+            elif format_europresse == 1 :
                name_xpath      = "//span[@class = 'DocPublicationName']"
                header_xpath    = "//span[@class = 'DocHeader']"
                title_xpath     = "string(//div[@class = 'titreArticleVisu'])"
@@ -79,8 +79,8 @@ class EuropressFileParser(FileParser):
                doi_xpath  = "//span[@id='ucPubliC_lblNodoc']/text()"


-        except Exception as error:
-            print(error)
+        except Exception as error :
+            PrintException()

        # parse all the articles, one by one
        try:
@@ -99,7 +99,18 @@ class EuropressFileParser(FileParser):
                            else:
                                hyperdata['journal'] = name.text.encode(codif)

+                    countbis = 0
+
                    for header in html_article.xpath(header_xpath):
+#                        print(count)
+#                        countbis += 1
+
+#                        try:
+#                            print('109', hyperdata['publication_date'])
+#                        except:
+#                            print('no date yet')
+#                            pass
+
                        try:
                            text = header.text
                            #print("header", text)
@@ -145,12 +156,10 @@ class EuropressFileParser(FileParser):
                                        hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
                                        # hyperdata['publication_date'] = dateutil.parser.parse(text)
                                    except Exception as error:
-                                        print(error)
-                                        print(text)
+                                        print(error, text)
                                        pass


-                        
                        if test_date_en is not None:
                            localeEncoding = "en_GB.UTF-8"
                            locale.setlocale(locale.LC_ALL, localeEncoding)
@@ -168,6 +177,13 @@ class EuropressFileParser(FileParser):
                        if test_page is not None:
                            hyperdata['page'] = test_page.group(1).encode(codif)

+#                    try:
+#                        print('183', hyperdata['publication_date'])
+#                    except:
+#                        print('no date yet')
+#                        pass
+#
+
                    hyperdata['title'] = html_article.xpath(title_xpath).encode(codif)
                    hyperdata['abstract']  = html_article.xpath(text_xpath)

@@ -215,7 +231,7 @@ class EuropressFileParser(FileParser):
                    hyperdata['publication_year']  = hyperdata['publication_date'].strftime('%Y')
                    hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                    hyperdata['publication_day']  = hyperdata['publication_date'].strftime('%d')
-                    hyperdata.pop('publication_date')
+                    #hyperdata.pop('publication_date')

                    if len(hyperdata['abstract'])>0 and format_europresse == 50:
                        hyperdata['doi'] = str(hyperdata['abstract'][-9])

--- a/parsing/FileParsers/FileParser.py
+++ b/parsing/FileParsers/FileParser.py
 import collections
+import datetime
 import dateutil.parser
 import zipfile
 import chardet
+import re

 from ..Caches import LanguagesCache


+DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)
+
+
 class FileParser:
    """Base class for performing files parsing depending on their type.
    """
@@ -29,6 +34,21 @@ class FileParser:
        """

        # First, check the split dates...
+        date_string = hyperdata.get('publication_date_to_parse', None)
+        if date_string is not None:
+            date_string = re.sub(r'\/\/+', '', date_string)
+            date_string = re.sub(r'undefined', '', date_string)
+            try:
+                hyperdata['publication' + "_date"] = dateutil.parser.parse(
+                    date_string,
+                    default=DEFAULT_DATE
+                ).strftime("%Y-%m-%d %H:%M:%S")
+            except:
+                print('Parser Zotero, Date not parsed for:', date_string)
+                hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+
+        elif hyperdata.get('publication_year', None) is not None:
            prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
            for prefix in prefixes:
                date_string = hyperdata[prefix + "_year"]
@@ -51,11 +71,15 @@ class FileParser:
                    hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
                except:
                    pass
+        else:
+            hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # ...then parse all the "date" fields, to parse it into separate elements
        prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
        for prefix in prefixes:
            date = dateutil.parser.parse(hyperdata[prefix + "_date"])
+            #print(date)
+
            hyperdata[prefix + "_year"]      = date.strftime("%Y")
            hyperdata[prefix + "_month"]     = date.strftime("%m")
            hyperdata[prefix + "_day"]       = date.strftime("%d")
@@ -65,6 +89,7 @@ class FileParser:

        # finally, return the transformed result!
        return hyperdata
+        print(hyperdata['publication_date'])

    def format_hyperdata_languages(self, hyperdata):
        """format the languages found in the hyperdata."""

--- a/parsing/FileParsers/RisFileParser.py
+++ b/parsing/FileParsers/RisFileParser.py
@@ -3,6 +3,8 @@ from .FileParser import FileParser

 from ..Caches import LanguagesCache

+from admin.utils import PrintException
+
 class RisFileParser(FileParser):

    def __init__(self, language_cache=None):
@@ -57,5 +59,11 @@ class RisFileParser(FileParser):
                    print(error)
        # if a hyperdata object is left in memory, yield it as well
        if hyperdata:
+#            try:
+#                if hyperdata['date_to_parse']:
+#                    print(hyperdata['date_to_parse'])
+#            except:
+#                pass
+#
            #print(hyperdata['title'])
            yield hyperdata
--- a/parsing/FileParsers/ZoteroFileParser.py
+++ b/parsing/FileParsers/ZoteroFileParser.py
+from .RisFileParser import RisFileParser
+
+from ..Caches import LanguagesCache
+
+class ZoteroFileParser(RisFileParser):
+    """RIS parser configured for files exported by Zotero.
+
+    Only the constructor is defined here; every parsing method is inherited
+    from RisFileParser. The constructor sets the start offset and the table
+    that maps two-letter RIS tags to hyperdata keys.
+    """
+
+    def __init__(self):
+        # Start the __init__ lookup *after* RisFileParser in the MRO, so
+        # RisFileParser.__init__ itself is not executed here.
+        super(RisFileParser, self).__init__()
+
+        # NOTE(review): the meaning of this offset is defined by the
+        # inherited parsing code, which is not visible here -- confirm.
+        self._begin = 6
+
+        # One row per RIS tag: (tag, hyperdata key, separator).
+        # A separator of None means the entry carries no "separator" item.
+        hyperdata_rows = (
+            (b"TI", "title", " "),
+            (b"AU", "authors", ", "),
+            (b"UR", "doi", None),
+            (b"DA", "publication_date_to_parse", None),
+            (b"PY", "publication_year", None),
+            (b"PD", "publication_month", None),
+            (b"LA", "language_iso2", None),
+            (b"AB", "abstract", " "),
+            (b"WC", "fields", None),
+        )
+
+        # "ER" marks the end of a record and goes first, matching the
+        # original literal's key order.
+        tag_table = {b"ER": {"type": "delimiter"}}
+        for tag, key, separator in hyperdata_rows:
+            descriptor = {"type": "hyperdata", "key": key}
+            if separator is not None:
+                descriptor["separator"] = separator
+            tag_table[tag] = descriptor
+
+        self._parameters = tag_table
+
--- a/parsing/FileParsers/__init__.py
+++ b/parsing/FileParsers/__init__.py
 from .RisFileParser import RisFileParser
 from .IsiFileParser import IsiFileParser
 from .JstorFileParser import JstorFileParser
+from .ZoteroFileParser import ZoteroFileParser
 from .PubmedFileParser import PubmedFileParser
 from .EuropressFileParser import EuropressFileParser
 from .ISText import ISText

--- a/parsing/corpustools.py
+++ b/parsing/corpustools.py
--- a/parsing/parsers_config.py
+++ b/parsing/parsers_config.py
@@ -4,12 +4,11 @@ parsers = {
        'Pubmed (xml format)'               : PubmedFileParser,
        'Web of Science (ISI format)'       : IsiFileParser,
        'Scopus (RIS format)'               : RisFileParser,
-        'Zotero (RIS format)'               : JstorFileParser,
+        'Zotero (RIS format)'               : ZoteroFileParser,
        'Jstor (RIS format)'                : JstorFileParser,
        #'Europress'                        : EuropressFileParser,
        'Europress (French)'                : EuropressFileParser,
        'Europress (English)'               : EuropressFileParser,
        'CSVParser'                : CSVParser,
-        
    }