Commit 569d42d8 authored by Mathieu Rodic

[BUGFIX] If a PubMed abstract is made of several parts, all of them are now concatenated

parent 35eede53
......@@ -3,6 +3,8 @@ from lxml import etree
from .FileParser import FileParser
from ..NgramsExtractors import *
from collections import defaultdict
class PubmedFileParser(FileParser):
def _parse(self, file):
......@@ -13,7 +15,7 @@ class PubmedFileParser(FileParser):
# parse all the articles, one by one
for xml_article in xml_articles:
# extract data from the document
metadata = {}
metadata = defaultdict(str)
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
......@@ -34,7 +36,9 @@ class PubmedFileParser(FileParser):
for xml_author in xml_node
])
else:
metadata[key] = xml_node.text
if metadata[key]:
metadata[key] += '\n'
metadata[key] += xml_node.text
except:
pass
yield metadata
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment