Commit bbb9c801 authored by Administrator

Cleaning unfinished work

parent a57cf30d
class EuropressFileParser(FileParser, contents):
def parse:
pass
\ No newline at end of file
#import FileParser
#
#class EuropressFileParser(FileParser, contents):
#
# def parse():
# pass
#
......@@ -76,18 +76,18 @@ class FileParser:
resource = Resource(guid=guid)
# If the parent node already has a child with this resource, pass
# (is it a good thing?)
if parentNode.get_descendants().
if parentNode.get_descendants():
# create the document itself
childNode = Node(
user = parentNode.pk,
type = self._document_nodetype,
name = title,
language = language
metadata = metadata
resource = resource
)
parentNode.add_child(childNode)
childNode = Node(
user = parentNode.pk,
type = self._document_nodetype,
name = title,
language = language,
metadata = metadata,
resource = resource
)
parentNode.add_child(childNode)
# parse it!
ngrams = self.extract_ngrams(contents, language)
# we should already be in a transaction, so no use doing another one (or is there?)
......
from django.db import transaction
from FileParser import FileParser
from parsing.FileParsers.FileParser import FileParser
class PubmedFileParser(FileParser):
......@@ -21,18 +21,18 @@ class PubmedFileParser(FileParser):
# other metadata should also be included:
# authors, submission date, etc.
"date_pub": datetime.date(year, month, day),
"journal": xml_article.find('MedlineCitation/Article/Journal/Title').text
"title": xml_article.find('MedlineCitation/Article/ArticleTitle').text
"language_iso3": xml_article.find('MedlineCitation/Article/Language').text
"journal": xml_article.find('MedlineCitation/Article/Journal/Title').text,
"title": xml_article.find('MedlineCitation/Article/ArticleTitle').text,
"language_iso3": xml_article.find('MedlineCitation/Article/Language').text,
"doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]').text
}
contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
# create the document in the database
yield self.create_document(
parentNode = parentNode
parentNode = parentNode,
title = metadata["title"],
contents = contents,
language = self._languages_iso3[metadata["language"].lower()]
language = self._languages_iso3[metadata["language"].lower()],
metadata = metadata,
guid = metadata["doi"],
)
from EuropressFileParser import EuropressFileParser
from PubmedFileParser import PubmedFileParser
\ No newline at end of file
#from parsing.FileParsers import EuropressFileParser
from parsing.FileParsers import PubmedFileParser
from Taggers.NltkTagger import NltkTagger
from Taggers.TreeTagger import TreeTagger
\ No newline at end of file
from Taggers.TreeTagger import TreeTagger
from Taggers import *
from NgramsExtractors import *
from FileParsers import *
#from .Taggers import *
#from .NgramsExtractors import *
from .FileParsers import *
import zipfile
import Collections
import collections
# import chardet
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment