Commit 569d42d8 authored by Mathieu Rodic

[BUGFIX] If a PubMed abstract is made of several parts, all of them are now concatenated

parent 35eede53
...@@ -3,6 +3,8 @@ from lxml import etree ...@@ -3,6 +3,8 @@ from lxml import etree
from .FileParser import FileParser from .FileParser import FileParser
from ..NgramsExtractors import * from ..NgramsExtractors import *
from collections import defaultdict
class PubmedFileParser(FileParser): class PubmedFileParser(FileParser):
def _parse(self, file): def _parse(self, file):
...@@ -13,7 +15,7 @@ class PubmedFileParser(FileParser): ...@@ -13,7 +15,7 @@ class PubmedFileParser(FileParser):
# parse all the articles, one by one # parse all the articles, one by one
for xml_article in xml_articles: for xml_article in xml_articles:
# extract data from the document # extract data from the document
metadata = {} metadata = defaultdict(str)
metadata_path = { metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title', "journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle', "title" : 'MedlineCitation/Article/ArticleTitle',
...@@ -34,7 +36,9 @@ class PubmedFileParser(FileParser): ...@@ -34,7 +36,9 @@ class PubmedFileParser(FileParser):
for xml_author in xml_node for xml_author in xml_node
]) ])
else: else:
metadata[key] = xml_node.text if metadata[key]:
metadata[key] += '\n'
metadata[key] += xml_node.text
except: except:
pass pass
yield metadata yield metadata
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment