[BUG FIX] Adding try/except for ugly encoded corpora.

6a2e10fb · Administrator · 0c97f772 · 6a2e10fb
Commit 6a2e10fb authored Jan 08, 2015 by Administrator
Hide whitespace changes
Inline Side-by-side

Showing with 136 additions and 127 deletions

EuropressFileParser.py parsing/FileParsers/EuropressFileParser.py +136 -127

No files found.
--- a/parsing/FileParsers/EuropressFileParser.py
+++ b/parsing/FileParsers/EuropressFileParser.py
@@ -42,141 +42,150 @@ class EuropressFileParser(FileParser):
        # initialize the list of metadata
        metadata_list = []
        # parse all the articles, one by one
+        try:
+            for html_article in html_articles:
+                
+                metadata = {}
+                
+                if len(html_article):
+                    for name in html_article.xpath("./tr/td/span[@class = 'DocPublicationName']"):
+                        if name.text is not None:
+                            format_journal = re.compile('(.*), (.*)', re.UNICODE)
+                            test_journal = format_journal.match(name.text)
+                            if test_journal is not None:
+                                metadata['source'] = test_journal.group(1)
+                                metadata['volume'] = test_journal.group(2)
+                            else:
+                                metadata['source'] = name.text.encode(codif)
+
+                    for header in html_article.xpath("./tr/td/span[@class = 'DocHeader']"):
+                        try:
+                            text = header.text
+                        except Exception as error:
+                            print(error)
+
+                        
+                        if isinstance(text, bytes):
+                            text = text.decode(encoding)
+
+                        format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)
+                        test_date_fr = format_date_fr.match(text)
+                        
+                        format_date_en = re.compile('\w+\s+\d+,\s+\d{4}', re.UNICODE)
+                        test_date_en = format_date_en.match(text)
+
+                        format_sect = re.compile('(\D+),', re.UNICODE)
+                        test_sect = format_sect.match(text)
+                        
+                        format_page = re.compile(', p. (\w+)', re.UNICODE)
+                        test_page = format_page.match(text)
+                        
+                        if test_date_fr is not None:
+                            self.localeEncoding = "fr_FR"
+                            locale.setlocale(locale.LC_ALL, localeEncoding)
+                            if encoding != "utf-8":
+                                text = text.replace('י', 'é')
+                                text = text.replace('ű', 'û')
+                                text = text.replace(' aot ', ' août ')

-        for html_article in html_articles:
-            
-            metadata = {}
-            
-            if len(html_article):
-                for name in html_article.xpath("./tr/td/span[@class = 'DocPublicationName']"):
-                    if name.text is not None:
-                        format_journal = re.compile('(.*), (.*)', re.UNICODE)
-                        test_journal = format_journal.match(name.text)
-                        if test_journal is not None:
-                            metadata['source'] = test_journal.group(1)
-                            metadata['volume'] = test_journal.group(2)
-                        else:
-                            metadata['source'] = name.text.encode(codif)
-
-                for header in html_article.xpath("./tr/td/span[@class = 'DocHeader']"):
-                    text = header.text
-                    if isinstance(text, bytes):
-                        text = text.decode(encoding)
-
-                    format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)
-                    test_date_fr = format_date_fr.match(text)
-                    
-                    format_date_en = re.compile('\w+\s+\d+,\s+\d{4}', re.UNICODE)
-                    test_date_en = format_date_en.match(text)

-                    format_sect = re.compile('(\D+),', re.UNICODE)
-                    test_sect = format_sect.match(text)
-                    
-                    format_page = re.compile(', p. (\w+)', re.UNICODE)
-                    test_page = format_page.match(text)
-                    
-                    if test_date_fr is not None:
-                        self.localeEncoding = "fr_FR"
-                        locale.setlocale(locale.LC_ALL, localeEncoding)
-                        if encoding != "utf-8":
-                            text = text.replace('י', 'é')
-                            text = text.replace('ű', 'û')
-                            text = text.replace(' aot ', ' août ')
-
-
-                        try :
-                            metadata['publication_date'] = datetime.strptime(text, '%d %B %Y')
-                        except :
-                            try:
-                                metadata['publication_date'] = datetime.strptime(text, '%B %Y')
+                            try :
+                                metadata['publication_date'] = datetime.strptime(text, '%d %B %Y')
                            except :
                                try:
-                                    metadata['publication_date'] = dateutil.parser.parse(text)
-                                except Exception as error:
-                                    print(error)
-                                    print(text)
-                                    pass
-                    
-                    if test_date_en is not None:
-                        localeEncoding = "en_GB.UTF-8"
-                        locale.setlocale(locale.LC_ALL, localeEncoding)
-                        try :
-                            metadata['publication_date'] = datetime.strptime(text, '%B %d, %Y')
-                        except :
+                                    metadata['publication_date'] = datetime.strptime(text, '%B %Y')
+                                except :
+                                    try:
+                                        metadata['publication_date'] = dateutil.parser.parse(text)
+                                    except Exception as error:
+                                        print(error)
+                                        print(text)
+                                        pass
+                        
+                        if test_date_en is not None:
+                            localeEncoding = "en_GB.UTF-8"
+                            locale.setlocale(locale.LC_ALL, localeEncoding)
                            try :
-                                metadata['publication_date'] = datetime.strptime(text, '%B %Y')
+                                metadata['publication_date'] = datetime.strptime(text, '%B %d, %Y')
                            except :
-                                pass
+                                try :
+                                    metadata['publication_date'] = datetime.strptime(text, '%B %Y')
+                                except :
+                                    pass

-                    if test_sect is not None:
-                        metadata['section'] = test_sect.group(1).encode(codif)
+                        if test_sect is not None:
+                            metadata['section'] = test_sect.group(1).encode(codif)
+                        
+                        if test_page is not None:
+                            metadata['page'] = test_page.group(1).encode(codif)
+
+                    metadata['title'] = html_article.xpath("string(./tr/td/span[@class = 'TitreArticleVisu'])").encode(codif)
+                    metadata['text']  = html_article.xpath("./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()")
+                   
+                    line = 0
+                    br_tag = 10
+                    for i in html_articles[count].iter():
+                       # print line, br, i, i.tag, i.attrib, i.tail
+                        if i.tag == "span":
+                            if "class" in i.attrib:
+                                if i.attrib['class'] == 'TitreArticleVisu':
+                                    line = 1
+                                    br_tag = 2
+                        if line == 1 and i.tag == "br":
+                            br_tag -= 1
+                        if line == 1 and br_tag == 0:
+                            try:
+                                metadata['authors'] = str.title(etree.tostring(i, method="text", encoding=codif)).encode(codif)#.split(';')
+                            except:
+                                metadata['authors'] = 'not found'
+                            line = 0
+                            br_tag = 10
                    
-                    if test_page is not None:
-                        metadata['page'] = test_page.group(1).encode(codif)
-
-                metadata['title'] = html_article.xpath("string(./tr/td/span[@class = 'TitreArticleVisu'])").encode(codif)
-                metadata['text']  = html_article.xpath("./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()")
-               
-                line = 0
-                br_tag = 10
-                for i in html_articles[count].iter():
-                   # print line, br, i, i.tag, i.attrib, i.tail
-                    if i.tag == "span":
-                        if "class" in i.attrib:
-                            if i.attrib['class'] == 'TitreArticleVisu':
-                                line = 1
-                                br_tag = 2
-                    if line == 1 and i.tag == "br":
-                        br_tag -= 1
-                    if line == 1 and br_tag == 0:
-                        try:
-                            metadata['authors'] = str.title(etree.tostring(i, method="text", encoding=codif)).encode(codif)#.split(';')
-                        except:
-                            metadata['authors'] = 'not found'
-                        line = 0
-                        br_tag = 10
-                
-                
-                try:
-                    if metadata['publication_date'] is not None or metadata['publication_date'] != '':
-                        try:
-                            back = metadata['publication_date']
-                        except Exception as e: 
-                            #print(e)
-                            pass
-                    else:
-                        try:
-                            metadata['publication_date'] = back
-                        except Exception as e:
-                            print(e)
-                except :
-                    metadata['publication_date'] = timezone.now()
-
-                #if lang == 'fr':
-                #metadata['language_iso2'] = 'fr'
-                #elif lang == 'en':
-                #    metadata['language_iso2'] = 'en'
-                
-                
-                metadata['publication_year']  = metadata['publication_date'].strftime('%Y')
-                metadata['publication_month'] = metadata['publication_date'].strftime('%m')
-                metadata['publication_day']  = metadata['publication_date'].strftime('%d')
-                metadata['publication_date'] = ""
-
-                metadata['object_id'] = str(metadata['text'][-9])
-                metadata['text'].pop()
-                metadata['text'] = str(' '.join(metadata['text']))
-                metadata['text'] = str(re.sub('Tous droits réservés.*$', '', metadata['text']))
-
-                metadata['bdd']  = u'europresse'
-                metadata['url']  = u''
-                
-              #metadata_str = {}
-                for key, value in metadata.items():
-                    metadata[key] = value.decode() if isinstance(value, bytes) else value
-                metadata_list.append(metadata)
-                count += 1
-         
+                    
+                    try:
+                        if metadata['publication_date'] is not None or metadata['publication_date'] != '':
+                            try:
+                                back = metadata['publication_date']
+                            except Exception as e: 
+                                #print(e)
+                                pass
+                        else:
+                            try:
+                                metadata['publication_date'] = back
+                            except Exception as e:
+                                print(e)
+                    except :
+                        metadata['publication_date'] = timezone.now()
+
+                    #if lang == 'fr':
+                    #metadata['language_iso2'] = 'fr'
+                    #elif lang == 'en':
+                    #    metadata['language_iso2'] = 'en'
+                    
+                    
+                    metadata['publication_year']  = metadata['publication_date'].strftime('%Y')
+                    metadata['publication_month'] = metadata['publication_date'].strftime('%m')
+                    metadata['publication_day']  = metadata['publication_date'].strftime('%d')
+                    metadata['publication_date'] = ""
+
+                    metadata['object_id'] = str(metadata['text'][-9])
+                    metadata['text'].pop()
+                    metadata['text'] = str(' '.join(metadata['text']))
+                    metadata['text'] = str(re.sub('Tous droits réservés.*$', '', metadata['text']))
+
+                    metadata['bdd']  = u'europresse'
+                    metadata['url']  = u''
+                    
+                  #metadata_str = {}
+                    for key, value in metadata.items():
+                        metadata[key] = value.decode() if isinstance(value, bytes) else value
+                    metadata_list.append(metadata)
+                    count += 1
+        
+        except Exception as error:
+            print(error)
+            pass
+
 #       from pprint import pprint
 #       pprint(metadata_list)
 #       return []