Merge branch 'testing' into prod-dev

d37bdbd3 · Administrator · a1b68438 · 407b96ab · d37bdbd3 · d37bdbd3
Commit d37bdbd3 authored May 12, 2015 by Administrator
7 changed files
--- a/parsing/FileParsers/EuropressFileParser.py
+++ b/parsing/FileParsers/EuropressFileParser.py
@@ -11,7 +11,7 @@ from ..NgramsExtractors import *
 from admin.utils import PrintException

 class EuropressFileParser(FileParser):
-  
+
    def _parse(self, file):

        localeEncoding = "fr_FR"
@@ -39,21 +39,21 @@ class EuropressFileParser(FileParser):
        try:
            html_parser = etree.HTMLParser(encoding=codif)
            html = etree.fromstring(contents, html_parser)
-            
+
            try :
-                
+
                format_europresse = 50
                html_articles = html.xpath('/html/body/table/tbody')

                if len(html_articles) < 1:
                    html_articles = html.xpath('/html/body/table')
-                    
+
                    if len(html_articles) < 1:
                        format_europresse = 1
                        html_articles = html.xpath('//div[@id="docContain"]')
            except :
                PrintException()
-            
+
            if format_europresse == 50 :
                name_xpath      = "./tr/td/span[@class = 'DocPublicationName']"
                header_xpath    = "./tr/td/span[@class = 'DocHeader']"
@@ -77,7 +77,7 @@ class EuropressFileParser(FileParser):
                        or self::td[@class='txtCertificat'] \
                        )]/text()"
                doi_xpath  = "//span[@id='ucPubliC_lblNodoc']/text()"
-                
+

        except Exception as error :
            PrintException()
@@ -85,9 +85,9 @@ class EuropressFileParser(FileParser):
        # parse all the articles, one by one
        try:
            for html_article in html_articles:
-                
+
                hyperdata = {}
-                
+
                if len(html_article):
                    for name in html_article.xpath(name_xpath):
                        if name.text is not None:
@@ -98,26 +98,26 @@ class EuropressFileParser(FileParser):
                                hyperdata['volume'] = test_journal.group(2)
                            else:
                                hyperdata['journal'] = name.text.encode(codif)
-                    
+
                    countbis = 0

                    for header in html_article.xpath(header_xpath):
 #                        print(count)
 #                        countbis += 1
-                        
+
 #                        try:
 #                            print('109', hyperdata['publication_date'])
 #                        except:
 #                            print('no date yet')
 #                            pass
-                        
+
                        try:
                            text = header.text
                            #print("header", text)
                        except Exception as error:
                            print(error)

-                        
+
                        if isinstance(text, bytes):
                            text = text.decode(encoding)
                        format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)
@@ -134,9 +134,9 @@ class EuropressFileParser(FileParser):
                            test_date_en = None
                            test_sect = None
                            test_page = None
-                        
-                        
-                        
+
+
+
                        if test_date_fr is not None:
                            self.localeEncoding = "fr_FR"
                            locale.setlocale(locale.LC_ALL, localeEncoding)
@@ -158,7 +158,7 @@ class EuropressFileParser(FileParser):
                                    except Exception as error:
                                        print(error, text)
                                        pass
-                        
+

                        if test_date_en is not None:
                            localeEncoding = "en_GB.UTF-8"
@@ -173,20 +173,20 @@ class EuropressFileParser(FileParser):

                        if test_sect is not None:
                            hyperdata['section'] = test_sect.group(1).encode(codif)
-                        
+
                        if test_page is not None:
                            hyperdata['page'] = test_page.group(1).encode(codif)
-                    
-                    try:
-                        print('183', hyperdata['publication_date'])
-                    except:
-                        print('no date yet')
-                        pass
-                        
+
+#                    try:
+#                        print('183', hyperdata['publication_date'])
+#                    except:
+#                        print('no date yet')
+#                        pass
+#

                    hyperdata['title'] = html_article.xpath(title_xpath).encode(codif)
                    hyperdata['abstract']  = html_article.xpath(text_xpath)
-                   
+
                    line = 0
                    br_tag = 10
                    for i in html_articles[count].iter():
@@ -205,13 +205,13 @@ class EuropressFileParser(FileParser):
                                hyperdata['authors'] = 'not found'
                            line = 0
                            br_tag = 10
-                    
-                                       
+
+
                    try:
                        if hyperdata['publication_date'] is not None or hyperdata['publication_date'] != '':
                            try:
                                back = hyperdata['publication_date']
-                            except Exception as e: 
+                            except Exception as e:
                                #print(e)
                                pass
                        else:
@@ -226,14 +226,14 @@ class EuropressFileParser(FileParser):
                    #hyperdata['language_iso2'] = 'fr'
                    #elif lang == 'en':
                    #    hyperdata['language_iso2'] = 'en'
-                    
-                    
+
+
                    hyperdata['publication_year']  = hyperdata['publication_date'].strftime('%Y')
                    hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                    hyperdata['publication_day']  = hyperdata['publication_date'].strftime('%d')
                    #hyperdata.pop('publication_date')
-                    
-                    if len(hyperdata['abstract'])>0 and format_europresse == 50: 
+
+                    if len(hyperdata['abstract'])>0 and format_europresse == 50:
                        hyperdata['doi'] = str(hyperdata['abstract'][-9])
                        hyperdata['abstract'].pop()
 # Here add separator for paragraphs
@@ -245,15 +245,15 @@ class EuropressFileParser(FileParser):
 # Here add separator for paragraphs
                        hyperdata['abstract'] = str(' '.join(hyperdata['abstract']))

-                    else: 
+                    else:
                        hyperdata['doi'] = "not found"
-                    
+
                    hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
                    hyperdata['length_letters'] = len(hyperdata['abstract'])
-                    
+
                    hyperdata['bdd']  = u'europresse'
                    hyperdata['url']  = u''
-                    
+
                  #hyperdata_str = {}
                    for key, value in hyperdata.items():
                        hyperdata[key] = value.decode() if isinstance(value, bytes) else value

--- a/parsing/FileParsers/FileParser.py
+++ b/parsing/FileParsers/FileParser.py
@@ -4,21 +4,21 @@ import zipfile
 import chardet

 from ..Caches import LanguagesCache
-    
+

 class FileParser:
    """Base class for performing files parsing depending on their type.
    """
    def __init__(self, language_cache=None):
        self._languages_cache = LanguagesCache() if language_cache is None else language_cache
-    
+
    def detect_encoding(self, string):
        """Useful method to detect the document encoding.
        """
        encoding = chardet.detect(string)
        return encoding.get('encoding', 'UTF-8')
-    
-    
+
+
    def format_hyperdata_dates(self, hyperdata):
        """Format the dates found in the hyperdata.
        Examples:
@@ -27,7 +27,7 @@ class FileParser:
            {"publication_year": "2014"}
            -> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
        """
-        
+
        # First, check the split dates...
        prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
        for prefix in prefixes:
@@ -51,21 +51,23 @@ class FileParser:
                hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
            except:
                pass
-        
+
        # ...then parse all the "date" fields, to parse it into separate elements
        prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_date"]
        for prefix in prefixes:
            date = dateutil.parser.parse(hyperdata[prefix + "_date"])
+            print('date')
+
            hyperdata[prefix + "_year"]      = date.strftime("%Y")
            hyperdata[prefix + "_month"]     = date.strftime("%m")
            hyperdata[prefix + "_day"]       = date.strftime("%d")
            hyperdata[prefix + "_hour"]      = date.strftime("%H")
            hyperdata[prefix + "_minute"]    = date.strftime("%M")
            hyperdata[prefix + "_second"]    = date.strftime("%S")
-                
+
        # finally, return the transformed result!
        return hyperdata
-        
+
    def format_hyperdata_languages(self, hyperdata):
        """format the languages found in the hyperdata."""
        language = None
@@ -81,18 +83,18 @@ class FileParser:
            hyperdata["language_iso3"]       = language.iso3
            hyperdata["language_fullname"]   = language.fullname
        return hyperdata
-        
+
    def format_hyperdata(self, hyperdata):
        """Format the hyperdata."""
        hyperdata = self.format_hyperdata_dates(hyperdata)
        hyperdata = self.format_hyperdata_languages(hyperdata)
        return hyperdata
-    
-    
+
+
    def _parse(self, file):
        """This method shall be overriden by inherited classes."""
        return list()
-        
+
    def parse(self, file):
        """Parse the file, and its children files found in the file.
        """

--- a/parsing/FileParsers/RisFileParser.py
+++ b/parsing/FileParsers/RisFileParser.py
@@ -3,15 +3,17 @@ from .FileParser import FileParser

 from ..Caches import LanguagesCache

+from admin.utils import PrintException
+
 class RisFileParser(FileParser):

    def __init__(self, language_cache=None):
-        
+
        super(FileParser, self).__init__()
        self._languages_cache = LanguagesCache() if language_cache is None else language_cache
-        
+
        self._begin = 6
-        
+
        self._parameters = {
            b"ER":  {"type": "delimiter"},
            b"TI":  {"type": "hyperdata", "key": "title", "separator": " "},
@@ -24,7 +26,7 @@ class RisFileParser(FileParser):
            b"AB":  {"type": "hyperdata", "key": "abstract", "separator": " "},
            b"WC":  {"type": "hyperdata", "key": "fields"},
        }
-        
+

    def _parse(self, file):
        hyperdata = {}
@@ -57,5 +59,11 @@ class RisFileParser(FileParser):
                    print(error)
        # if a hyperdata object is left in memory, yield it as well
        if hyperdata:
+#            try:
+#                if hyperdata['date_to_parse']:
+#                    print(hyperdata['date_to_parse'])
+#            except:
+#                pass
+#
            #print(hyperdata['title'])
            yield hyperdata
--- a/parsing/FileParsers/ZoteroFileParser.py
+++ b/parsing/FileParsers/ZoteroFileParser.py
+from .RisFileParser import RisFileParser
+
+from ..Caches import LanguagesCache
+
+class ZoteroFileParser(RisFileParser):
+    """RIS-style parser configured with the tag table used for Zotero exports.
+
+    No ``_parse`` is defined here, so the one inherited from
+    ``RisFileParser`` is used; this class only supplies its own
+    ``_begin`` and ``_parameters`` values.
+    """
+
+    def __init__(self, language_cache=None):
+        """Set up the Zotero tag table.
+
+        ``language_cache`` is optional and is handed to
+        ``FileParser.__init__``, so this constructor accepts the same
+        argument as ``RisFileParser`` while still working when called
+        with no argument at all.
+        """
+        # super(RisFileParser, self) starts the method lookup *after*
+        # RisFileParser, i.e. at FileParser: RisFileParser.__init__ (which
+        # installs its own tag table) is deliberately skipped.
+        super(RisFileParser, self).__init__(language_cache)
+
+        # NOTE(review): same value as RisFileParser._begin; presumably the
+        # column at which a field's value starts on a line -- confirm
+        # against the inherited _parse.
+        self._begin = 6
+
+        # Two-letter RIS tag (bytes) -> description of the field; "key" is
+        # the hyperdata key the tag is mapped to.
+        self._parameters = {
+            b"ER":  {"type": "delimiter"},
+            b"TI":  {"type": "hyperdata", "key": "title", "separator": " "},
+            b"AU":  {"type": "hyperdata", "key": "authors", "separator": ", "},
+            b"UR":  {"type": "hyperdata", "key": "doi"},
+            b"DA":  {"type": "hyperdata", "key": "publication_date"},
+            b"PY":  {"type": "hyperdata", "key": "publication_year"},
+            b"PD":  {"type": "hyperdata", "key": "publication_month"},
+            b"LA":  {"type": "hyperdata", "key": "language_iso2"},
+            b"AB":  {"type": "hyperdata", "key": "abstract", "separator": " "},
+            b"WC":  {"type": "hyperdata", "key": "fields"},
+        }
+
--- a/parsing/FileParsers/__init__.py
+++ b/parsing/FileParsers/__init__.py
 from .RisFileParser import RisFileParser
 from .IsiFileParser import IsiFileParser
 from .JstorFileParser import JstorFileParser
+from .ZoteroFileParser import ZoteroFileParser
 from .PubmedFileParser import PubmedFileParser
 from .EuropressFileParser import EuropressFileParser
 from .ISText import ISText
--- a/parsing/corpustools.py
+++ b/parsing/corpustools.py
@@ -128,7 +128,7 @@ def parse_resources(corpus, user=None, user_id=None):
            nodes.append(node)
            #
            # TODO: mark node-resources associations as parsed
-            # 
+            #
    dbg.show('insert %d documents' % len(nodes))
    session.add_all(nodes)
    session.commit()
@@ -205,7 +205,7 @@ def extract_ngrams(corpus, keys):
        language.id: language.iso2
        for language in session.query(Language)
    }
-    
+
    ngrams_data = set()
    ngrams_language_data = set()
    ngrams_tag_data = set()
@@ -241,7 +241,7 @@ def extract_ngrams(corpus, keys):
                        #tag_id   =  14
                        #print('tag_id_2', tag_id)
                    node_ngram_list[node_id][terms] += 1
-                    ngrams_data.add((n, terms))
+                    ngrams_data.add((n, terms[:255]))
                    ngrams_language_data.add((terms, language_id))
                    ngrams_tag_data.add((terms, tag_id))

@@ -268,7 +268,7 @@ def extract_ngrams(corpus, keys):
            ngram.terms = tmp__ngrams.terms
    ''' % (Ngram.__table__.name, ))
    # insert, then get the ids back
-    
+
    cursor.execute('''
        INSERT INTO
            %s (n, terms)
@@ -279,8 +279,8 @@ def extract_ngrams(corpus, keys):
        WHERE
            id IS NULL
    ''' % (Ngram.__table__.name, ))
-    
-    
+
+
    cursor.execute('''
        UPDATE
            tmp__ngrams
@@ -293,14 +293,14 @@ def extract_ngrams(corpus, keys):
        AND
            tmp__ngrams.id IS NULL
    ''' % (Ngram.__table__.name, ))
-    
+
    # get all ids
    ngram_ids = dict()
    cursor.execute('SELECT id, terms FROM tmp__ngrams')
    for row in cursor.fetchall():
        ngram_ids[row[1]] = row[0]
-    
-    # 
+
+    #
    dbg.show('insert associations')
    node_ngram_data = list()
    for node_id, ngrams in node_ngram_list.items():

--- a/parsing/parsers_config.py
+++ b/parsing/parsers_config.py
@@ -4,11 +4,11 @@ parsers = {
        'Pubmed (xml format)'               : PubmedFileParser,
        'Web of Science (ISI format)'       : IsiFileParser,
        'Scopus (RIS format)'               : RisFileParser,
-        'Zotero (RIS format)'               : JstorFileParser,
+        'Zotero (RIS format)'               : ZoteroFileParser,
        'Jstor (RIS format)'                : JstorFileParser,
        #'Europress'                        : EuropressFileParser,
        'Europress (French)'                : EuropressFileParser,
        'Europress (English)'               : EuropressFileParser,
-        
+
    }