Commit be4cd18b authored by Mathieu Rodic

The file parsers are almost done.

Todo: incorporate the extracted ngrams and link them to the given document.
parent 95f90782
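To make the remaining TODO concrete, here is a minimal sketch of how the extracted ngrams could be attached to the newly created document node. The Ngram and Node_Ngram models, their fields, and the import path are assumptions made for illustration only; none of them appear in this commit.

# Assumed import path; adjust to wherever the (hypothetical) models live.
from node.models import Ngram, Node_Ngram

def link_ngrams_to_document(document, ngrams):
    """Associate each extracted ngram with the given document node."""
    for ngram in ngrams:
        # extract_ngrams() yields sequences of subtree leaves; normalise
        # each one to a plain string before storing it (assumed schema).
        terms = ' '.join(str(token) for token in ngram)
        ngram_row, _created = Ngram.objects.get_or_create(terms=terms)
        # Record the (document, ngram) association.
        Node_Ngram.objects.get_or_create(node=document, ngram=ngram_row)

Inside create_document, the incomplete loop over ngrams could then simply call link_ngrams_to_document(childNode, ngrams) once such models exist.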
@@ -12,10 +12,35 @@ class FileParser:
self._file = file
# ...and parse!
self.parse()
# extractors
self._extractors = {}
self._document_nodetype = NodeType.get(label='document')
languages = Language.objects.all()
self._languages_iso2 = {language.iso2.lower(): language for language in languages}
self._languages_iso3 = {language.iso3.lower(): language for language in languages}
"""Extract the ngrams from a given text.
"""
def extract_ngrams(self, text, language):
# Get the appropriate ngrams extractor, if it exists
if language not in self._extractors:
extractor = None
if language == 'en':
extractor = EnglishNgramsExtractor()
elif language == 'fr':
extractor = FrenchNgramsExtractor()
self._extractors[language] = extractor
else:
extractor = self._extractors[language]
# Extract the ngrams, if an appropriate extractor was found
if extractor:
return extractor.extract_ngrams(text)
else:
return []
"""Add a document to the database.
"""
def create_document(self, title, contents, language, metadata, guid=None):
def create_document(self, parentNode, title, contents, language, metadata, guid=None):
# create or retrieve a resource for that document, based on its user id
if guid is None:
resource = Resource(guid=guid)
@@ -25,19 +50,19 @@ class FileParser:
except:
resource = Resource(guid=guid)
# create the document itself
document = Node(
# WRITE STUFF HERE!!!
childNode = Node(
user = parentNode.user,
type = self._document_nodetype,
name = title,
language = language,
metadata = metadata,
resource = resource
)
parentNode.add_child(instance=childNode)
# parse it!
# TODO: beware the language!!!!
if self._parsers[language] = None:
self._parsers[language] = NltkParser
# WRITE STUFF HERE!!!
ngrams = self.extract_ngrams(contents, language)
# TODO: link each extracted ngram to the document (see sketch above)
for ngram in ngrams:
    pass
# return the created document
return childNode
@@ -20,19 +20,19 @@ class PubmedFileParser(FileParser):
metadata = {
# other metadata should also be included:
# authors, submission date, etc.
"date_pub": datetime.date(year, month, day),
"journal": xml_article.find('MedlineCitation/Article/Journal/Title').text
"title": xml_article.find('MedlineCitation/Article/ArticleTitle').text
"language": xml_article.find('MedlineCitation/Article/Language').text
"doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]').text
"date_pub": datetime.date(year, month, day),
"journal": xml_article.find('MedlineCitation/Article/Journal/Title').text
"title": xml_article.find('MedlineCitation/Article/ArticleTitle').text
"language_iso3": xml_article.find('MedlineCitation/Article/Language').text
"doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]').text
}
contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
# create the document in the database
childNode = self.create_document(
yield self.create_document(
parentNode = parentNode,
title = metadata["title"],
contents = contents,
language = metadata["language"],
language = self._languages_iso3[metadata["language_iso3"].lower()],
metadata = metadata,
guid = metadata["doi"],
)
parentNode.add_child(childNode)
@@ -37,7 +37,7 @@ class NgramsExtractor:
for subtree in grammar_parsed.subtrees():
if subtree.label() == self._label:
result.append(subtree.leaves())
except Exception as e:
except:
print("Problem while parsing rule '%s'" % (self._rule, ))
pass
return iter(result)
@@ -83,7 +83,7 @@ class Node(MP_Node):
# the 'file' column should be deprecated soon;
# use resources instead.
file = models.FileField(upload_to=upload_to, blank=True)
resources = models.ManyToManyField(Resource)
resource = models.ForeignKey(Resource)
#objects = hstore.HStoreManager()
def __str__(self):