Commit 755a8d4d authored by Mathieu Rodic

[FEATURE] The parser is working, directly from a Node instance!

parent c50f2fff
from django.db import models
from django.utils import timezone
from django.contrib.auth.models import User
from django_hstore import hstore
from cte_tree.models import CTENode, Manager
#from cte_tree.fields import DepthField, PathField, OrderingField
from parsing.Caches import LanguagesCache
from parsing.FileParsers import *
from time import time
from collections import defaultdict
from django.contrib.auth.models import User
from collections import defaultdict
# Some useful functions
# TODO: start the function name with an underscore (private)
......@@ -28,7 +31,7 @@ class Language(models.Model):
def __str__(self):
return self.fullname
class DatabaseType(models.Model):
class ResourceType(models.Model):
name = models.CharField(max_length=255)
def __str__(self):
return self.name
......@@ -40,7 +43,7 @@ class Ngram(models.Model):
class Resource(models.Model):
guid = models.CharField(max_length=255)
bdd_type = models.ForeignKey(DatabaseType, blank=True, null=True)
type = models.ForeignKey(ResourceType, blank=True, null=True)
file = models.FileField(upload_to=upload_to, blank=True)
digest = models.CharField(max_length=32) # MD5 digest
......@@ -89,12 +92,33 @@ class Node(CTENode):
node_resource.save()
return resource
def parse(self):
# TODO: that's not very pretty...
# can't we make a simple join in Django?
def parse_resources(self):
# parse all resources into a list of metadata
metadata_list = []
for node_resource in self.node_resource.filter(parsed=False):
# TODO: call parsers here
print(node_resource.resource.file)
resource = node_resource.resource
parser = defaultdict(lambda:FileParser.FileParser, {
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
})[resource.type.name]()
print(parser)
metadata_list += parser.parse(str(resource.file))
# insert in the database!
type = NodeType.objects.get(name='Document')
langages_cache = LanguagesCache()
Node.objects.bulk_create([
Node(
user = self.user,
type = type,
name = metadata['title'] if 'title' in metadata else '',
parent = self,
language = langages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None,
metadata = metadata,
)
for metadata in metadata_list
])
def extract_ngrams(self, keys, cache):
# what do we want from the cache?
......@@ -118,6 +142,7 @@ class Node(CTENode):
weight = weight
)
class Node_Resource(models.Model):
node = models.ForeignKey(Node, related_name='node_resource')
resource = models.ForeignKey(Resource)
......@@ -126,7 +151,7 @@ class Node_Resource(models.Model):
class Node_Ngram(models.Model):
    """Weighted association between a Node and an Ngram."""
    node = models.ForeignKey(Node)
    ngram = models.ForeignKey(Ngram)
    # BUG FIX: the stale pre-change `weight = models.IntegerField()` line
    # duplicated this field; only the FloatField definition is kept.
    weight = models.FloatField()
class Project(Node):
class Meta:
......
import collections
from node.models import Ngram
import node.models
from parsing.NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor
from collections import defaultdict
class NgramsCache(collections.defaultdict):
class NgramsCache(defaultdict):
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time.
This class is language-specific."""
......@@ -17,14 +17,14 @@ class NgramsCache(collections.defaultdict):
"""If the terms are not yet present in the dictionary,
retrieve it from the database or insert it."""
try:
ngram = Ngram.get(terms=terms, language=self.language)
ngram = node.models.Ngram.get(terms=terms, language=self.language)
except:
ngram = Ngram(terms=terms, n=len(terms.split()), language=self.language)
ngram = node.models.Ngram(terms=terms, n=len(terms.split()), language=self.language)
ngram.save()
self[terms] = ngram
return self[terms]
class NgramsCaches(collections.defaultdict):
class NgramsCaches(defaultdict):
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time."""
def __missing__(self, language):
......@@ -33,7 +33,7 @@ class NgramsCaches(collections.defaultdict):
self[language] = NgramsCache(language)
return self[language]
class NgramsExtractorsCache(collections.defaultdict):
class NgramsExtractorsCache(defaultdict):
"""This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time."""
def __missing__(self, key):
......@@ -64,11 +64,25 @@ class NgramsExtractorsCache(collections.defaultdict):
# return the proper extractor
return self[key]
class LanguagesCache(defaultdict):
    """Case-insensitive lookup of Language instances.

    Preloads every known language under its iso2, iso3 and full-name keys
    (lower-cased); unknown keys resolve to None and are cached as None.
    """

    def __init__(self):
        super().__init__()
        # index every language under all three identifiers, lower-cased
        for language in node.models.Language.objects.all():
            self[language.iso2.lower()] = language
            self[language.iso3.lower()] = language
            self[language.fullname.lower()] = language

    def __missing__(self, key):
        # BUG FIX: the previous code returned self[betterKey], which
        # re-entered __missing__ a second time when the normalized key was
        # also absent; compute the value once, cache it under both the raw
        # and the normalized spellings, and return it directly.
        normalized = key.strip().lower()
        language = self[normalized] if normalized in self else None
        self[key] = language
        self[normalized] = language
        return language
class Caches:
    """THE cache of the caches: bundles all per-kind lookup caches.

    See NgramsCaches, NgramsExtractorsCache and LanguagesCache.
    BUG FIX: the stale pre-rename header `class Cache:` left an empty
    class body (a syntax error); only the renamed class is kept.
    """

    def __init__(self):
        self.ngrams = NgramsCaches()
        self.extractors = NgramsExtractorsCache()
        self.languages = LanguagesCache()
import collections
import zipfile

import dateutil.parser
from django.db import transaction

from node.models import Node, NodeType, Language, Ngram, Node_Ngram
from parsing.Caches import LanguagesCache, NgramsCaches
from parsing.NgramsExtractors import *
class FileParser:
"""Base class for performing files parsing depending on their type.
"""
def __init__(self, file=None, filepath="", encoding="utf8"):
    """Build a parser from an open file object or from a file path.

    file:     an already-open binary file object, or None
    filepath: path opened in binary mode when no file object is given
    encoding: declared text encoding (stored by convention; not applied here)
    """
    # ...get the file item...
    if file is None:
        self._file = open(filepath, "rb")
    else:
        self._file = file
    # cache for ngrams
    # BUG FIX: was `NgramCaches()`, an undefined name — the class defined
    # in parsing.Caches is `NgramsCaches`.
    self._ngramcaches = NgramsCaches()
    # one ngrams extractor per language, created lazily
    self._extractors = dict()
    self._document_nodetype = NodeType.objects.get(name='Document')
    # index all languages by full name / iso2 / iso3 for quick lookup
    languages = Language.objects.all()
    self._languages_fullname = {language.fullname.lower(): language for language in languages}
    self._languages_iso2 = {language.iso2.lower(): language for language in languages}
    self._languages_iso3 = {language.iso3.lower(): language for language in languages}
def extract_ngrams(self, text, language):
    """Count the ngrams occurring in `text` for the given language.

    Returns a collections.Counter mapping each ngram string to its number
    of occurrences, or an empty dict when no extractor exists for the
    language (anything other than English or French).
    """
    # fetch the memoized extractor, building it on first use
    try:
        extractor = self._extractors[language]
    except KeyError:
        if language.iso2 == 'en':
            extractor = EnglishNgramsExtractor()
        elif language.iso2 == 'fr':
            extractor = FrenchNgramsExtractor()
        else:
            extractor = None
        # None is memoized too, so unsupported languages are checked once
        self._extractors[language] = extractor
    if not extractor:
        return dict()
    # join each ngram's tokens into one space-separated string and count
    return collections.Counter(
        ' '.join(token for token, tag in ngram)
        for ngram in extractor.extract_ngrams(text)
    )
def create_document(self, parentNode, title, metadata, guid=None):
    """Create and save a 'Document' child node of `parentNode`.

    Returns the saved Node.  `guid` is currently unused: resource-based
    deduplication is disabled in this revision.
    """
    metadata = self.format_metadata(metadata)
    # resolve the document language from its iso3 code, if present
    # BUG FIX: narrowed the bare `except:` — only a missing key should
    # fall back to "no language"; other errors must not be swallowed.
    try:
        language = self._languages_iso3[metadata["language_iso3"]]
    except KeyError:
        language = None
    # create the document itself
    childNode = Node(
        user     = parentNode.user,
        type     = self._document_nodetype,
        name     = title,
        language = language,
        metadata = metadata,
        parent   = parentNode,
    )
    childNode.save()
    return childNode
def __init__(self, language_cache=None):
    """Initialize the parser with a languages cache, creating a private
    one when the caller does not supply a shared instance."""
    if language_cache is None:
        language_cache = LanguagesCache()
    self._languages_cache = language_cache
def detect_encoding(self, string):
"""Useful method to detect the document encoding.
"""
pass
def _parse(self, parentNode, file):
"""This method shall be overriden by inherited classes."""
return list()
def parse(self, parentNode, file=None):
"""Parse the files found in the file.
This method shall be overriden by inherited classes.
"""
if file is None:
with transaction.atomic():
self.parse(parentNode, self._file)
if zipfile.is_zipfile(file):
with zipfile.ZipFile(file) as zipArchive:
for filename in zipArchive.namelist():
self.parse(parentNode, zipArchive.open(filename, "r"))
else:
self._parse(parentNode, file)
def extract(self, parentNode, keys):
"""Extract ngrams from the child nodes, given a list of field names."""
# NOTE(review): this is the pre-change implementation kept in the diff;
# several names below are never defined in this method (see notes).
# get all the descendants of type "document"
childNodes = parentNode.descendants().filter(type=self._document_nodetype)
# NOTE(review): `transaction` is not in this file's visible imports.
with transaction.atomic():
for childNode in childNodes:
# most importantly...
metadata = childNode.metadata
# which extractor shall we use?
# NOTE(review): `language` is used here but never assigned in this
# method — presumably it should come from `metadata`; confirm.
if language not in self._extractors:
extractor = None
if language.iso2 == 'en':
# use English
extractor = EnglishNgramsExtractor()
elif language.iso2 == 'fr':
# use French
extractor = FrenchNgramsExtractor()
else:
# no recognized language has been specified...
continue
self._extractors[language] = extractor
# extract ngrams from every field, find the id, count them
ngrams = collections.defaultdict(int)
ngramscache = self._ngramcaches[language]
for key in keys:
# NOTE(review): `text` is undefined — presumably metadata[key]; confirm.
for ngram in extractor.extract_ngrams(text):
ngram_text = ' '.join([token for token, tag in ngram])
# NOTE(review): `ngramtext` is undefined — the variable above is
# named `ngram_text` (missing underscore).
ngram_id = ngramscache[ngramtext].id
ngrams[ngram_id] += 1
# insert node/ngram associations in the database
for ngram_id, occurences in ngrams.items():
# NOTE(review): Node_Ngram defines a `weight` field in this commit,
# not `occurences` — this keyword would raise; confirm against model.
Node_Ngram(
node_id = childNode.id,
ngram_id = ngram_id,
occurences = occurences
).save()
def format_metadata_dates(self, metadata):
"""Format the dates found in the metadata.
Examples:
{"publication_date": "2014-10-23 09:57:42"}
-> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014"}
-> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014", ...}
{"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
"""
# First, check the split dates...
......@@ -189,31 +62,24 @@ class FileParser:
metadata[prefix + "_minute"] = date.strftime("%M")
metadata[prefix + "_second"] = date.strftime("%S")
# finally, return the result!
# finally, return the transformed result!
return metadata
def format_metadata_languages(self, metadata):
    """Normalize the language keys found in the metadata.

    Looks up the language named by any of `language_fullname`,
    `language_iso3`, `language_iso2` (in that order) in the languages
    cache; when found, rewrites all three keys consistently.  The
    metadata dict is returned (and mutated in place).
    """
    language = None
    for key in ["fullname", "iso3", "iso2"]:
        field = "language_" + key
        # BUG FIX: the lookup tested `key in metadata` but then read
        # metadata["language_" + key]; the prefixed field name must be
        # tested, otherwise the language was never found.
        if field in metadata:
            language = self._languages_cache[metadata[field]]
            if language:
                break
    if language:
        metadata["language_iso2"] = language.iso2
        metadata["language_iso3"] = language.iso3
        metadata["language_fullname"] = language.fullname
    return metadata
def format_metadata(self, metadata):
"""Format the metadata."""
metadata = self.format_metadata_dates(metadata)
......@@ -221,3 +87,23 @@ class FileParser:
return metadata
def _parse(self, file):
    """Default implementation: yields no metadata.  Subclasses override
    this to return a list of metadata dicts parsed from `file`."""
    return []
def parse(self, file):
    """Parse `file` into an iterable of formatted metadata dicts.

    ZIP archives are recursed into, one parse per member file; any other
    file is handed to the subclass hook `_parse`.  Every metadata dict is
    passed through `format_metadata` before being returned.
    """
    # initialize the list of metadata
    metadata_list = []
    # if the file is a ZIP archive, recurse on each of its members...
    if zipfile.is_zipfile(file):
        # BUG FIX: the archive was never closed; `with` releases the
        # underlying handle once all members have been parsed.
        with zipfile.ZipFile(file) as zipArchive:
            for filename in zipArchive.namelist():
                metadata_list += self.parse(zipArchive.open(filename, "r"))
    # ...otherwise, parse it directly
    else:
        metadata_list += self._parse(file)
    # return the formatted metadata
    return map(self.format_metadata, metadata_list)
......@@ -5,16 +5,14 @@ from parsing.NgramsExtractors import *
class PubmedFileParser(FileParser):
def _parse(self, parentNode, file):
def _parse(self, file):
# open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
xml = etree.parse(file, parser=xml_parser)
xml_articles = xml.findall('PubmedArticle')
with transaction.atomic():
# initialize the list of documents
documents = []
# initialize the list of metadata
metadata_list = []
# parse all the articles, one by one
# all database operations should be performed within one transaction
for xml_article in xml_articles:
# extract data from the document
metadata = {}
......@@ -34,15 +32,6 @@ class PubmedFileParser(FileParser):
metadata[key] = node.text
except:
metadata[key] = ""
contents = metadata["abstract"]
# create the document in the database
document = self.create_document(
parentNode = parentNode,
title = metadata["title"],
metadata = metadata,
#guid = metadata["doi"],
)
if document:
documents.append(document)
# return the list of documents
return documents
metadata_list.append(metadata)
# return the list of metadata
return metadata_list
......@@ -7,11 +7,11 @@ class RisFileParser(FileParser):
_parameters = {
}
def _parse(self, parentNode, file):
def _parse(self, file):
metadata_list = []
metadata = {}
last_key = None
last_values = []
with transaction.atomic():
for line in self._file:
if len(line) > 2:
parameter_key = line[:2]
......@@ -23,17 +23,9 @@ class RisFileParser(FileParser):
metadata[parameter["key"]] = separator.join(last_values)
elif parameter["type"] == "delimiter":
language = self._languages_fullname[metadata["language"].lower()]
# self.create_document(
# parentNode = parentNode,
# title = metadata["title"],
# metadata = metadata,
# guid = metadata["doi"]
# )
print(self.format_metadata(metadata))
print()
metadata = {}
metadata_list.append(metadata)
last_key = parameter_key
last_values = []
last_values.append(line[3:-1].decode())
self._file.close()
return metadata_list
#from .Taggers import *
#from .NgramsExtractors import *
from .FileParsers import *
from node.models import Node, NodeType
import zipfile
import collections
# import chardet
class Parser:
    """Walks nodes and creates one 'Document' child node per file found
    in their attached ZIP archives."""

    def __init__(self):
        pass

    def parse_file(self, file):
        # TODO: check the GUID before parsing (deduplication)
        pass

    def parse_node_fichier(self, node):
        """Create a Document child per member of node.fichier (a ZIP), if any."""
        if node.fichier and zipfile.is_zipfile(node.fichier):
            with zipfile.ZipFile(node.fichier, "r") as zipFile:
                node_type = NodeType.objects.get(name="Document")
                for filename in zipFile.namelist():
                    file = zipFile.open(filename, "r")
                    # BUG FIX: was `node.objects.create` — `objects` is a
                    # manager on the model class, not on an instance.
                    Node.objects.create(
                        parent = node,
                        type = node_type,
                        user = node.user,
                    )

    def parse_node(self, node):
        """Create a Document child per member of each ZIP resource of `node`."""
        for resource in node.resources:
            # BUG FIX: was `node.resources.file` — the loop variable
            # `resource` was never used, so only the manager attribute
            # was inspected instead of each resource.
            if resource.file and zipfile.is_zipfile(resource.file):
                with zipfile.ZipFile(resource.file, "r") as zipFile:
                    # BUG FIX: was `NodeType.get(...)` — model classes have
                    # no `get`; the query goes through the default manager.
                    # Hoisted out of the per-file loop (loop-invariant).
                    document_type = NodeType.objects.get(name="Document")
                    for filename in zipFile.namelist():
                        file = zipFile.open(filename, "r")
                        Node.objects.create(
                            parent = node,
                            type = document_type,
                            user = node.user,
                        )

    def parse_node_recursively(self, node):
        """Parse `node` and every one of its descendants."""
        self.parse_node(node)
        for descendant in node.get_descendants():
            self.parse_node(descendant)
from node.models import Node, NodeType, User, Language
from parsing.Caches import Cache
from node.models import Node, NodeType, User, Language, ResourceType
from parsing.Caches import Caches
try:
me = User.objects.get(username='Mat')
......@@ -7,6 +7,12 @@ except:
me = User(username='Mat')
me.save()
try:
typePubmed = ResourceType.get(name='pubmed')
except:
typePubmed = ResourceType(name='pubmed')
typePubmed.save()
try:
typeCorpus = NodeType.get(name='corpus')
typeDoc = NodeType.get(name='document')
......@@ -25,25 +31,24 @@ try:
except:
corpus = Node(name='My first corpus', type=typeCorpus, user=me)
corpus.save()
for i in range(64):
title = 'Document #%d' % i
Node(
user = me,
# type = self._document_nodetype,
name = title,
language = english,
metadata = {'title':title},
#resource = resource,
type = typeDoc,
parent = corpus
).save()
corpus.add_resource(file='/path/to/file')
corpus.parse()
exit()
cache = Cache()
# for i in range(64):
# title = 'Document #%d' % i
# Node(
# user = me,
# # type = self._document_nodetype,
# name = title,
# language = english,
# metadata = {'title':title},
# #resource = resource,
# type = typeDoc,
# parent = corpus
# ).save()
corpus.children.all().delete()
corpus.add_resource(file='./data_samples/pubmed.zip', type=typePubmed)
corpus.parse_resources()
cache = Caches()
for child in corpus.children.all():
print(child.id)
child.extract_ngrams(['title'], cache)
\ No newline at end of file
print('#%d\t%s\n%s\n\n' % (child.id, child.name, child.metadata['abstract']))
# child.extract_ngrams(['title'], cache)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment