Commit bbb9c801 authored by Administrator

Cleaning unfinished work

parent a57cf30d
#import FileParser
class EuropressFileParser(FileParser, contents): #
#class EuropressFileParser(FileParser, contents):
def parse: #
pass # def parse():
# pass
\ No newline at end of file #
...@@ -76,18 +76,18 @@ class FileParser: ...@@ -76,18 +76,18 @@ class FileParser:
resource = Resource(guid=guid) resource = Resource(guid=guid)
# If the parent node already has a child with this resource, pass # If the parent node already has a child with this resource, pass
# (is it a good thing?) # (is it a good thing?)
if parentNode.get_descendants(). if parentNode.get_descendants():
# create the document itself # create the document itself
childNode = Node( childNode = Node(
user = parentNode.pk, user = parentNode.pk,
type = self._document_nodetype, type = self._document_nodetype,
name = title, name = title,
language = language language = language,
metadata = metadata metadata = metadata,
resource = resource resource = resource
) )
parentNode.add_child(childNode) parentNode.add_child(childNode)
# parse it! # parse it!
ngrams = self.extract_ngrams(contents, language) ngrams = self.extract_ngrams(contents, language)
# we should already be in a transaction, so no use doing another one (or is there?) # we should already be in a transaction, so no use doing another one (or is there?)
......
from django.db import transaction from django.db import transaction
from FileParser import FileParser from parsing.FileParsers.FileParser import FileParser
class PubmedFileParser(FileParser): class PubmedFileParser(FileParser):
...@@ -21,18 +21,18 @@ class PubmedFileParser(FileParser): ...@@ -21,18 +21,18 @@ class PubmedFileParser(FileParser):
# other metadata should also be included: # other metadata should also be included:
# authors, submission date, etc. # authors, submission date, etc.
"date_pub": datetime.date(year, month, day), "date_pub": datetime.date(year, month, day),
"journal": xml_article.find('MedlineCitation/Article/Journal/Title').text "journal": xml_article.find('MedlineCitation/Article/Journal/Title').text,
"title": xml_article.find('MedlineCitation/Article/ArticleTitle').text "title": xml_article.find('MedlineCitation/Article/ArticleTitle').text,
"language_iso3": xml_article.find('MedlineCitation/Article/Language').text "language_iso3": xml_article.find('MedlineCitation/Article/Language').text,
"doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]').text "doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]').text
} }
contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
# create the document in the database # create the document in the database
yield self.create_document( yield self.create_document(
parentNode = parentNode parentNode = parentNode,
title = metadata["title"], title = metadata["title"],
contents = contents, contents = contents,
language = self._languages_iso3[metadata["language"].lower()] language = self._languages_iso3[metadata["language"].lower()],
metadata = metadata, metadata = metadata,
guid = metadata["doi"], guid = metadata["doi"],
) )
from EuropressFileParser import EuropressFileParser #from parsing.FileParsers import EuropressFileParser
from PubmedFileParser import PubmedFileParser from parsing.FileParsers import PubmedFileParser
\ No newline at end of file
from Taggers.NltkTagger import NltkTagger from Taggers.NltkTagger import NltkTagger
from Taggers.TreeTagger import TreeTagger from Taggers.TreeTagger import TreeTagger
\ No newline at end of file
from Taggers import * #from .Taggers import *
from NgramsExtractors import * #from .NgramsExtractors import *
from FileParsers import * from .FileParsers import *
import zipfile import zipfile
import Collections import collections
# import chardet # import chardet
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment