Commit f1c12e0f authored by Administrator

Testing the file parser and fixing the NodeType lookup error (NodeType has no `label` field; look it up by `name` instead)

parent bbb9c801
...@@ -35,7 +35,7 @@ class FileParser: ...@@ -35,7 +35,7 @@ class FileParser:
self._ngramcaches = collections.defaultdicts(NgramCache) self._ngramcaches = collections.defaultdicts(NgramCache)
# extractors # extractors
self._extractors = dict() self._extractors = dict()
self._document_nodetype = NodeType.get(label='document') self._document_nodetype = NodeType.get(name='Document')
with Language.objects.all() as languages: with Language.objects.all() as languages:
self._languages_iso2 = {language.iso2.lower(): language for language in Language} self._languages_iso2 = {language.iso2.lower(): language for language in Language}
self._languages_iso3 = {language.iso3.lower(): language for language in Language} self._languages_iso3 = {language.iso3.lower(): language for language in Language}
......
...@@ -6,7 +6,7 @@ class PubmedFileParser(FileParser): ...@@ -6,7 +6,7 @@ class PubmedFileParser(FileParser):
def parse(self, parentNode): def parse(self, parentNode):
# open the file as XML # open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False,recover=True) xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
xml = etree.parse(self._file, parser=xml_parser) xml = etree.parse(self._file, parser=xml_parser)
# parse all the articles, one by one # parse all the articles, one by one
# all database operations should be performed within one transaction # all database operations should be performed within one transaction
......
#from .Taggers import * #from .Taggers import *
#from .NgramsExtractors import * #from .NgramsExtractors import *
from .FileParsers import * from .FileParsers import *
from node.models import Node, NodeType
import zipfile import zipfile
import collections import collections
...@@ -18,13 +18,26 @@ class Parser: ...@@ -18,13 +18,26 @@ class Parser:
# CHECKER GUID!!!!!!!!!!!!!!!!!!!!!!!!!!!! # CHECKER GUID!!!!!!!!!!!!!!!!!!!!!!!!!!!!
pass pass
def parse_node_fichier(self, node):
    """Create one child "Document" node per entry of the node's zip archive.

    Does nothing when ``node.fichier`` is empty or is not a zip file.

    Args:
        node: the node whose ``fichier`` attribute holds the archive. It is
            used as the parent of every created node, and its ``user`` is
            copied onto them.
    """
    if not (node.fichier and zipfile.is_zipfile(node.fichier)):
        return
    with zipfile.ZipFile(node.fichier, "r") as zipFile:
        # Resolved once, outside the loop: every created node shares this type.
        node_type = NodeType.objects.get(name="Document")
        for filename in zipFile.namelist():
            # The member is still opened for each entry, but through a context
            # manager so the handle is closed instead of leaking; its content
            # is not consumed here.
            with zipFile.open(filename, "r"):
                # Django managers are only reachable through the model class;
                # `node.objects` on an instance raises AttributeError.
                Node.objects.create(
                    parent=node,
                    type=node_type,
                    user=node.user,
                )
def parse_node(self, node): def parse_node(self, node):
for resource in node.resources: for resource in node.resources:
if node.resources.file and zipfile.is_zipfile(node.resources.file): if node.resources.file and zipfile.is_zipfile(node.resources.file):
with zipfile.ZipFile(node.resources.file, "r") as zipFile: with zipfile.ZipFile(node.resources.file, "r") as zipFile:
for filename in zipFile.namelist(): for filename in zipFile.namelist():
file = zipFile.open(filename, "r") file = zipFile.open(filename, "r")
node.add_child( Node.objects.create(
parent = node,
type = NodeType.get(name="Document"), type = NodeType.get(name="Document"),
user = node.user, user = node.user,
......
import parsing
from node.models import Node
# Fetch the node named "PubMed corpus" that will be handed to the parser.
node = Node.objects.get(name="PubMed corpus")
# Build a parser and run its zip-archive parsing entry point on that node.
parser = parsing.Parser()
parser.parse_node_fichier(node)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment