FileParser.py 6.02 KB
import collections
import datetime
import dateutil.parser
import zipfile
import chardet
import re

from ..Caches import LanguagesCache


DEFAULT_DATE = datetime.datetime(datetime.MINYEAR, 1, 1)


class FileParser:
    """Base class for performing files parsing depending on their type.
    """
    def __init__(self, language_cache=None):
        self._languages_cache = LanguagesCache() if language_cache is None else language_cache

    def detect_encoding(self, string):
        """Useful method to detect the document encoding.
        """
        encoding = chardet.detect(string)
        return encoding.get('encoding', 'UTF-8')


    def format_hyperdata_dates(self, hyperdata):
        """Format the dates found in the hyperdata.
        Examples:
            {"publication_date": "2014-10-23 09:57:42"}
            -> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014", ...}
            {"publication_year": "2014"}
            -> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
        """

        # First, check the split dates...
        # This part mainly deal with Zotero data but can be usefull for others
        # parts
        date_string = hyperdata.get('publication_date_to_parse', None)
        if date_string is not None:
            date_string = re.sub(r'\/\/+(\w*|\d*)', '', date_string)
            #date_string = re.sub(r'undefined', '', date_string)
            try:
                hyperdata['publication' + "_date"] = dateutil.parser.parse(
                    date_string,
                    default=DEFAULT_DATE
                ).strftime("%Y-%m-%d %H:%M:%S")
            except Exception as error:
                print(error, 'Parser Zotero, Date not parsed for:', date_string)
                hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


        elif hyperdata.get('publication_year', None) is not None:
            prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
            for prefix in prefixes:
                date_string = hyperdata[prefix + "_year"]
                key = prefix + "_month"
                if key in hyperdata:
                    date_string += " " + hyperdata[key]
                    key = prefix + "_day"
                    if key in hyperdata:
                        date_string += " " + hyperdata[key]
                        key = prefix + "_hour"
                        if key in hyperdata:
                            date_string += " " + hyperdata[key]
                            key = prefix + "_minute"
                            if key in hyperdata:
                                date_string += ":" + hyperdata[key]
                                key = prefix + "_second"
                                if key in hyperdata:
                                    date_string += ":" + hyperdata[key]
                try:
                    hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
                except:
                    pass
        else:
            hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # ...then parse all the "date" fields, to parse it into separate elements
        prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
        for prefix in prefixes:
            date = dateutil.parser.parse(hyperdata[prefix + "_date"])
            #print(date)

            hyperdata[prefix + "_year"]      = date.strftime("%Y")
            hyperdata[prefix + "_month"]     = date.strftime("%m")
            hyperdata[prefix + "_day"]       = date.strftime("%d")
            hyperdata[prefix + "_hour"]      = date.strftime("%H")
            hyperdata[prefix + "_minute"]    = date.strftime("%M")
            hyperdata[prefix + "_second"]    = date.strftime("%S")

        # finally, return the transformed result!
        return hyperdata
        print(hyperdata['publication_date'])

    def format_hyperdata_languages(self, hyperdata):
        """format the languages found in the hyperdata."""
        language = None
        for key in ["fullname", "iso3", "iso2"]:
            language_key = "language_" + key
            if language_key in hyperdata:
                language_symbol = hyperdata[language_key]
                language = self._languages_cache[language_symbol]
                if language:
                    break
        if language:
            hyperdata["language_iso2"]       = language.iso2
            hyperdata["language_iso3"]       = language.iso3
            hyperdata["language_fullname"]   = language.fullname
        return hyperdata

    def format_hyperdata(self, hyperdata):
        """Format the hyperdata."""
        hyperdata = self.format_hyperdata_dates(hyperdata)
        hyperdata = self.format_hyperdata_languages(hyperdata)
        return hyperdata


    def _parse(self, file):
        """This method shall be overriden by inherited classes."""
        return list()

    def parse(self, file):
        """Parse the file, and its children files found in the file.
        """
        # initialize the list of hyperdata
        hyperdata_list = []
        if zipfile.is_zipfile(file):
            #print(file, "# is the file is a ZIP archive, recurse on each of its files...")
            zipArchive = zipfile.ZipFile(file)
            for filename in zipArchive.namelist():
                try:
                    f = zipArchive.open(filename, 'r')
                    hyperdata_list += self.parse(f)
                    f.close()
                except Exception as error:
                    print(error)
        # ...otherwise, let's parse it directly!
        else:
            #print(file, "it is not a zip file")
            try:
                for hyperdata in self._parse(file):
                    hyperdata_list.append(self.format_hyperdata(hyperdata))
                if hasattr(file, 'close'):
                    file.close()
            except Exception as error:
                print(error)
        # return the list of formatted hyperdata
        return hyperdata_list