Commit 842efbe4 authored by delanoe

[FEAT] EUROPRESSE PARSER FOR HTML5. Still a bug with zip files.

parent ecd6640d
@@ -238,7 +238,7 @@ def compute_tfidf_global(corpus):
     lnD = log(D)
     cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (lnD, ))
     # show off
-    dbg.show('insert tfidf for %d documents' % (D, ))
+    dbg.show('insert tfidf')
     cursor.execute('''
         INSERT INTO
             %s (nodex_id, nodey_id, ngram_id, score)
......
import re
import locale
import sys
from datetime import datetime

from lxml import etree
from lxml.html import html5parser
from django.utils import timezone
import dateparser

from .FileParser import FileParser
from ..NgramsExtractors import *
from admin.utils import PrintException


class EuropressFileParser(FileParser):
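    # Europresse HTML exports come in several flavours; _parse detects which
    # one it has and records it in `format_europresse`:
    #   50   -> table-based export, one article per /html/body/table[/tbody]
    #   1    -> single-document export, rooted at div#docContain
    #   50.2 -> newer HTML5 export, one <article> element per document
    #           (parsed with lxml's html5parser)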
    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif = "UTF-8"
        count = 0
        # matches any field containing a 4-digit year
        format_date = re.compile(r'.*\d{4}.*', re.UNICODE)

        # accept either a path or an already-open binary file object,
        # and read the contents once
        if isinstance(file, str):
            file = open(file, 'rb')
        contents = file.read()
        encoding = self.detect_encoding(contents)
        if encoding != "utf-8":
            # Europresse exports are frequently latin-1; re-encode to UTF-8
            # so the HTML parsers below can use a single codec
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()
        try:
            html_parser = etree.HTMLParser(encoding=codif)
            html = etree.fromstring(contents, html_parser)
            try:
                format_europresse = 50
                html_articles = html.xpath('/html/body/table/tbody')
                if len(html_articles) < 1:
                    html_articles = html.xpath('/html/body/table')
                if len(html_articles) < 1:
                    format_europresse = 1
                    html_articles = html.xpath('//div[@id="docContain"]')
                if len(html_articles) < 1:
                    # HTML5 export: re-parse with html5parser, which handles
                    # the <article> elements that etree.HTMLParser misses
                    format_europresse = 50.2
                    html_parser = html5parser.etree.HTMLParser(encoding=codif)
                    html = html5parser.etree.fromstring(contents, html_parser)
                    html_articles = html.xpath('//article')
                if len(html_articles) < 1:
                    print("no article found")
            except:
                PrintException()
                # last resort: assume the HTML5 export
                format_europresse = 50.2
                html_parser = html5parser.etree.HTMLParser(encoding=codif)
                html = html5parser.etree.fromstring(contents, html_parser)
                html_articles = html.xpath('//article')
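            # shape of a format-50 document, for orientation (abridged,
            # reconstructed from the xpaths below):
            #   <table><tbody><tr><td>
            #     <span class="DocPublicationName">journal, volume</span>
            #     <span class="DocHeader">date / section / page</span>
            #     <span class="TitreArticleVisu">title</span>
            #     ... article text ...
            #   </td></tr></tbody></table>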
            if format_europresse == 50:
                name_xpath = "./tr/td/span[@class = 'DocPublicationName']"
                header_xpath = "./tr/td/span[@class = 'DocHeader']"
                title_xpath = "string(./tr/td/span[@class = 'TitreArticleVisu'])"
                text_xpath = "./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
            elif format_europresse == 1:
                name_xpath = "//span[@class = 'DocPublicationName']"
                header_xpath = "//span[@class = 'DocHeader']"
                title_xpath = "string(//div[@class = 'titreArticleVisu'])"
                text_xpath = "./descendant::*[\
                    not(\
                    self::div[@class='Doc-SourceText'] \
                    or self::span[@class='DocHeader'] \
                    or self::span[@class='DocPublicationName'] \
                    or self::span[@id='docNameVisu'] \
                    or self::div[@class='titreArticleVisu'] \
                    or self::span[@id='docNameContType'] \
                    or descendant-or-self::span[@id='ucPubliC_lblCertificatIssuedTo'] \
                    or descendant-or-self::span[@id='ucPubliC_lblEndDate'] \
                    or self::td[@class='txtCertificat'] \
                    )]/text()"
                doi_xpath = "//span[@id='ucPubliC_lblNodoc']/text()"
            elif format_europresse == 50.2:
                name_xpath = "./header/div/span[@class = 'DocPublicationName']"
                header_xpath = "./header/div/span[@class = 'DocHeader']"
                # 50.2 title/text xpaths return element lists, flattened into
                # paragraphs by paragraph_list() below
                title_xpath = "./header/div[@class='titreArticle']/descendant-or-self::*"
                text_xpath = "./section/div[@class='DocText']/descendant-or-self::*"
        except Exception as error:
            PrintException()
        def paragraph_list(data_xpath):
            # flatten a list of elements into paragraph strings: a <p> opens
            # a new paragraph, any other element's text is appended to the
            # current one
            result = list()
            for elem in data_xpath:
                if elem.text is not None and elem.text.strip() != '':
                    if elem.tag == 'p':
                        result.append(elem.text)
                    elif len(result) > 0:
                        result.append(result.pop() + elem.text)
                    else:
                        result.append(elem.text)
            return result
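        # e.g. (hypothetical) elements <p>"Foo", <span>" bar", <p>"Baz"
        # come out as ['Foo bar', 'Baz']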
        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                hyperdata = {}
                if len(html_article):
                    for name in html_article.xpath(name_xpath):
                        if name.text is not None:
                            # DocPublicationName reads "<journal>, <volume>"
                            format_journal = re.compile('(.*), (.*)', re.UNICODE)
                            test_journal = format_journal.match(name.text)
                            if test_journal is not None:
                                hyperdata['journal'] = test_journal.group(1)
                                hyperdata['volume'] = test_journal.group(2)
                            else:
                                hyperdata['journal'] = name.text.encode(codif)
                    for header in html_article.xpath(header_xpath):
                        text = header.text
                        if isinstance(text, bytes):
                            text = text.decode(encoding)
                        if format_europresse == 50.2 and text is not None:
                            # 50.2 headers read 'Brest Ville, mercredi 26 novembre 2014';
                            # keep only the date part after the first comma
                            try:  # 2015-oct-08 exception added
                                text = text.split(', ')[1]
                            except:
                                pass
                        format_date_fr = re.compile(r'\d*\s*\w+\s+\d{4}', re.UNICODE)
                        # v2 includes the weekday, e.g. 'mercredi 26 novembre 2014'
                        format_date_fr_v2 = re.compile(r'\s*\w+\s+\d+\s+\w+\s+\d{4}', re.UNICODE)
                        if text is not None:
                            test_date_fr = format_date_fr.match(text)
                            test_date_fr_v2 = format_date_fr_v2.match(text)
                            format_date_en = re.compile(r'\w+\s+\d+,\s+\d{4}', re.UNICODE)
                            test_date_en = format_date_en.match(text)
                            format_sect = re.compile(r'(\D+),', re.UNICODE)
                            test_sect = format_sect.match(text)
                            format_page = re.compile(r', p. (\w+)', re.UNICODE)
                            test_page = format_page.match(text)
                        else:
                            test_date_fr = None
                            test_date_fr_v2 = None
                            test_date_en = None
                            test_sect = None
                            test_page = None
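                        # typical header values these patterns target
                        # (illustrative): '26 novembre 2014' (fr),
                        # 'November 26, 2014' (en), 'Economie, ...' (section),
                        # ', p. 12' (page)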
                        if test_date_fr is not None or test_date_fr_v2 is not None:
                            self.localeEncoding = "fr_FR"
                            locale.setlocale(locale.LC_ALL, "fr_FR.utf-8")
                            if encoding != "utf-8":
                                # repair characters mangled by the latin1 round-trip
                                text = text.replace('י', 'é')
                                text = text.replace('ű', 'û')
                                text = text.replace(' aot ', ' août ')
                            # try dateparser first, then progressively looser
                            # strptime formats: '26 novembre 2014',
                            # 'novembre 2014', ' mercredi 26 novembre 2014'
                            try:
                                hyperdata['publication_date'] = dateparser.parse(text, languages=['fr'])
                            except:
                                try:
                                    hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
                                except:
                                    try:
                                        hyperdata['publication_date'] = datetime.strptime(text, '%B %Y')
                                    except:
                                        try:
                                            locale.setlocale(locale.LC_ALL, "fr_FR")
                                            hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
                                        except:
                                            try:
                                                hyperdata['publication_date'] = datetime.strptime(text, ' %A %d %B %Y')
                                            except Exception as error:
                                                print(error, text)
                        if test_date_en is not None:
                            localeEncoding = "en_GB.UTF-8"
                            locale.setlocale(locale.LC_ALL, localeEncoding)
                            try:
                                hyperdata['publication_date'] = datetime.strptime(text, '%B %d, %Y')
                            except:
                                try:
                                    hyperdata['publication_date'] = datetime.strptime(text, '%B %Y')
                                except:
                                    pass
                        if test_sect is not None:
                            hyperdata['section'] = test_sect.group(1).encode(codif)
                        if test_page is not None:
                            hyperdata['page'] = test_page.group(1).encode(codif)

                    # formats 50 and 1 use string(...) xpaths here; 50.2
                    # returns element lists that paragraph_list() handles below
                    title = html_article.xpath(title_xpath)
                    if isinstance(title, str):
                        hyperdata['title'] = title.encode(codif)
                    hyperdata['abstract'] = html_article.xpath(text_xpath)
                    line = 0
                    br_tag = 10
                    # the author line sits two <br/> after the TitreArticleVisu span
                    for i in html_articles[count].iter():
                        if i.tag == "span" and "class" in i.attrib:
                            if i.attrib['class'] == 'TitreArticleVisu':
                                line = 1
                                br_tag = 2
                        if line == 1 and i.tag == "br":
                            br_tag -= 1
                        if line == 1 and br_tag == 0:
                            try:
                                # tostring() returns bytes: decode before title-casing
                                authors = etree.tostring(i, method="text", encoding=codif)
                                hyperdata['authors'] = authors.decode(codif).title().encode(codif)
                            except:
                                hyperdata['authors'] = 'not found'
                            line = 0
                            br_tag = 10
                    try:
                        pub_name = html_article.xpath(name_xpath)[0].text
                        name = pub_name.split(', ')
                        hyperdata['journal'] = name[0]
                        hyperdata['number'] = name[1]
                    except:
                        pass

                    # articles sometimes omit their date: keep the last good
                    # one in `back` and reuse it when the current one is
                    # missing; fall back to now() as a last resort
                    try:
                        if hyperdata.get('publication_date'):
                            back = hyperdata['publication_date']
                        else:
                            hyperdata['publication_date'] = back
                    except:
                        hyperdata['publication_date'] = timezone.now()

                    try:
                        if len(hyperdata['abstract']) > 0 and format_europresse == 50:
                            # in format 50 the document id sits 9 text nodes
                            # from the end of the article
                            hyperdata['doi'] = str(hyperdata['abstract'][-9])
                            hyperdata['abstract'].pop()
                            # TODO add a real separator between paragraphs
                            hyperdata['abstract'] = str(' '.join(hyperdata['abstract']))
                            hyperdata['abstract'] = str(re.sub('Tous droits réservés.*$', '', hyperdata['abstract']))
                        elif format_europresse == 1:
                            hyperdata['doi'] = ' '.join(html_article.xpath(doi_xpath))
                            hyperdata['abstract'] = hyperdata['abstract'][:-9]
                            # TODO add a real separator between paragraphs
                            hyperdata['abstract'] = str(' '.join(hyperdata['abstract']))
                            hyperdata['journal'] = pub_name.strip()
                        else:
                            hyperdata['doi'] = "not found"
                    except:
                        pass
                    # format 50.2: DocHeader reads 'rubrique, date[, page]'
                    headers = html_article.xpath(header_xpath)
                    header = headers[0].text if headers else None
                    if header is not None:
                        header = header.split(', ')
                        if format_date.match(header[0]):
                            date = header[0]
                        else:
                            hyperdata['rubrique'] = header[0]
                            date = header[1]
                        try:
                            hyperdata['page'] = header[2].split(' ')[1]
                        except:
                            pass
                        try:
                            # dateparser.parse returns None (rather than
                            # raising) on failure, so only assign real dates
                            parsed = dateparser.parse(date, languages=['fr', 'en'])
                            if parsed is not None:
                                hyperdata['publication_date'] = parsed
                        except:
                            hyperdata['publication_date'] = timezone.now()
                    try:
                        title = paragraph_list(html_article.xpath(title_xpath))
                        hyperdata['title'] = title[0]
                    except:
                        pass
                    try:
                        text = paragraph_list(html_article.xpath(text_xpath))
                        hyperdata['abstract'] = ' '.join([' <p> ' + p + ' </p> '
                                                          for p in title[1:] + text])
                    except:
                        pass
                    hyperdata['length_letters'] = len(hyperdata['abstract'])
                    if not hyperdata.get('publication_date'):
                        hyperdata['publication_date'] = timezone.now()
                    hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
                    hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                    hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')

                    hyperdata['bdd'] = u'europresse'
                    hyperdata['url'] = u''
                    # values may still be bytes at this point; normalise to str
                    for key, value in hyperdata.items():
                        hyperdata[key] = value.decode() if isinstance(value, bytes) else value
                    yield hyperdata
                count += 1
            file.close()
        except:
            PrintException()


if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
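

# A minimal sketch for the remaining zip-file bug mentioned in the commit
# message: unpack each HTML member of a Europresse .zip export and feed it
# to the parser. `parse_zip` and the extension filter are illustrative
# assumptions, not part of the existing FileParser API.
import io
import zipfile

def parse_zip(path):
    parser = EuropressFileParser()
    with zipfile.ZipFile(path) as zf:
        for member in zf.namelist():
            if member.lower().endswith(('.html', '.htm')):
                with zf.open(member) as member_file:
                    yield from parser._parse(io.BytesIO(member_file.read()))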