Commit 755a8d4d authored by Mathieu Rodic's avatar Mathieu Rodic

[FEATURE] The parser is working, directly from a Node instance!

parent c50f2fff
from django.db import models from django.db import models
from django.utils import timezone from django.utils import timezone
from django.contrib.auth.models import User
from django_hstore import hstore from django_hstore import hstore
from cte_tree.models import CTENode, Manager from cte_tree.models import CTENode, Manager
#from cte_tree.fields import DepthField, PathField, OrderingField #from cte_tree.fields import DepthField, PathField, OrderingField
from parsing.Caches import LanguagesCache
from parsing.FileParsers import *
from time import time from time import time
from collections import defaultdict
from django.contrib.auth.models import User
from collections import defaultdict
# Some usefull functions # Some usefull functions
# TODO: start the function name with an underscore (private) # TODO: start the function name with an underscore (private)
...@@ -28,7 +31,7 @@ class Language(models.Model): ...@@ -28,7 +31,7 @@ class Language(models.Model):
def __str__(self): def __str__(self):
return self.fullname return self.fullname
class DatabaseType(models.Model): class ResourceType(models.Model):
name = models.CharField(max_length=255) name = models.CharField(max_length=255)
def __str__(self): def __str__(self):
return self.name return self.name
...@@ -40,7 +43,7 @@ class Ngram(models.Model): ...@@ -40,7 +43,7 @@ class Ngram(models.Model):
class Resource(models.Model): class Resource(models.Model):
guid = models.CharField(max_length=255) guid = models.CharField(max_length=255)
bdd_type = models.ForeignKey(DatabaseType, blank=True, null=True) type = models.ForeignKey(ResourceType, blank=True, null=True)
file = models.FileField(upload_to=upload_to, blank=True) file = models.FileField(upload_to=upload_to, blank=True)
digest = models.CharField(max_length=32) # MD5 digest digest = models.CharField(max_length=32) # MD5 digest
...@@ -89,12 +92,33 @@ class Node(CTENode): ...@@ -89,12 +92,33 @@ class Node(CTENode):
node_resource.save() node_resource.save()
return resource return resource
def parse(self): def parse_resources(self):
# TODO: that's not very pretty... # parse all resources into a list of metadata
# can't we make a simple join in Django? metadata_list = []
for node_resource in self.node_resource.filter(parsed=False): for node_resource in self.node_resource.filter(parsed=False):
# TODO: call parsers here resource = node_resource.resource
print(node_resource.resource.file) parser = defaultdict(lambda:FileParser.FileParser, {
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
'europress' : EuropressFileParser,
})[resource.type.name]()
print(parser)
metadata_list += parser.parse(str(resource.file))
# insert in the database!
type = NodeType.objects.get(name='Document')
langages_cache = LanguagesCache()
Node.objects.bulk_create([
Node(
user = self.user,
type = type,
name = metadata['title'] if 'title' in metadata else '',
parent = self,
language = langages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None,
metadata = metadata,
)
for metadata in metadata_list
])
def extract_ngrams(self, keys, cache): def extract_ngrams(self, keys, cache):
# what do we want from the cache? # what do we want from the cache?
...@@ -118,6 +142,7 @@ class Node(CTENode): ...@@ -118,6 +142,7 @@ class Node(CTENode):
weight = weight weight = weight
) )
class Node_Resource(models.Model): class Node_Resource(models.Model):
node = models.ForeignKey(Node, related_name='node_resource') node = models.ForeignKey(Node, related_name='node_resource')
resource = models.ForeignKey(Resource) resource = models.ForeignKey(Resource)
...@@ -126,7 +151,7 @@ class Node_Resource(models.Model): ...@@ -126,7 +151,7 @@ class Node_Resource(models.Model):
class Node_Ngram(models.Model): class Node_Ngram(models.Model):
node = models.ForeignKey(Node) node = models.ForeignKey(Node)
ngram = models.ForeignKey(Ngram) ngram = models.ForeignKey(Ngram)
weight = models.IntegerField() weight = models.FloatField()
class Project(Node): class Project(Node):
class Meta: class Meta:
......
import collections import node.models
from node.models import Ngram
from parsing.NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor from parsing.NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor
from collections import defaultdict
class NgramsCache(collections.defaultdict): class NgramsCache(defaultdict):
"""This allows the fast retrieval of ngram ids """This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time. from a cache instead of calling the database every time.
This class is language-specific.""" This class is language-specific."""
...@@ -17,14 +17,14 @@ class NgramsCache(collections.defaultdict): ...@@ -17,14 +17,14 @@ class NgramsCache(collections.defaultdict):
"""If the terms are not yet present in the dictionary, """If the terms are not yet present in the dictionary,
retrieve it from the database or insert it.""" retrieve it from the database or insert it."""
try: try:
ngram = Ngram.get(terms=terms, language=self.language) ngram = node.models.Ngram.get(terms=terms, language=self.language)
except: except:
ngram = Ngram(terms=terms, n=len(terms.split()), language=self.language) ngram = node.models.Ngram(terms=terms, n=len(terms.split()), language=self.language)
ngram.save() ngram.save()
self[terms] = ngram self[terms] = ngram
return self[terms] return self[terms]
class NgramsCaches(collections.defaultdict): class NgramsCaches(defaultdict):
"""This allows the fast retrieval of ngram ids """This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time.""" from a cache instead of calling the database every time."""
def __missing__(self, language): def __missing__(self, language):
...@@ -33,7 +33,7 @@ class NgramsCaches(collections.defaultdict): ...@@ -33,7 +33,7 @@ class NgramsCaches(collections.defaultdict):
self[language] = NgramsCache(language) self[language] = NgramsCache(language)
return self[language] return self[language]
class NgramsExtractorsCache(collections.defaultdict): class NgramsExtractorsCache(defaultdict):
"""This allows the fast retrieval of ngram ids """This allows the fast retrieval of ngram ids
from a cache instead of calling the database every time.""" from a cache instead of calling the database every time."""
def __missing__(self, key): def __missing__(self, key):
...@@ -64,11 +64,25 @@ class NgramsExtractorsCache(collections.defaultdict): ...@@ -64,11 +64,25 @@ class NgramsExtractorsCache(collections.defaultdict):
# return the proper extractor # return the proper extractor
return self[key] return self[key]
class LanguagesCache(defaultdict):
    """Case-insensitive cache mapping language identifiers to Language rows.

    Every Language is preloaded under its iso2, iso3 and full name
    (lowercased). Unknown identifiers resolve to None, and the result is
    memoized under the original spelling so repeated lookups are dict hits.
    """

    def __init__(self):
        # Preload each language under all three of its identifiers,
        # lowercased so lookups can be case-insensitive.
        for language in node.models.Language.objects.all():
            self[language.iso2.lower()] = language
            self[language.iso3.lower()] = language
            self[language.fullname.lower()] = language

    def __missing__(self, key):
        """Resolve an unseen key by normalizing it, caching the answer.

        Fix: the previous version returned `self[betterKey]`, which
        triggered a second recursive __missing__ call whenever the
        normalized key was absent; we now resolve once with dict.get().
        """
        normalized = key.strip().lower()
        language = self.get(normalized)  # plain lookup, no recursion
        self[key] = language
        return language
class Cache: class Caches:
"""This is THE cache of the caches. """This is THE cache of the caches.
See NgramsCaches and NgramsExtractorsCache for better understanding.""" See NgramsCaches and NgramsExtractorsCache for better understanding."""
def __init__(self): def __init__(self):
self.ngrams = NgramsCaches() self.ngrams = NgramsCaches()
self.extractors = NgramsExtractorsCache() self.extractors = NgramsExtractorsCache()
self.languages = LanguagesCache()
from node.models import Node, NodeType, Language, Ngram, Node_Ngram
from parsing.NgramsExtractors import *
import collections import collections
import dateutil.parser import dateutil.parser
import zipfile import zipfile
from parsing.Caches import LanguagesCache
class FileParser: class FileParser:
"""Base class for performing files parsing depending on their type. """Base class for performing files parsing depending on their type.
""" """
def __init__(self, language_cache=None):
def __init__(self, file=None, filepath="", encoding="utf8"): self._languages_cache = LanguagesCache() if language_cache is None else language_cache
# ...get the file item...
if file is None:
self._file = open(filepath, "rb")
else:
self._file = file
# cache for ngrams
self._ngramcaches = NgramCaches()
# extractors
self._extractors = dict()
self._document_nodetype = NodeType.objects.get(name='Document')
languages = Language.objects.all()
self._languages_fullname = {language.fullname.lower(): language for language in languages}
self._languages_iso2 = {language.iso2.lower(): language for language in languages}
self._languages_iso3 = {language.iso3.lower(): language for language in languages}
def extract_ngrams(self, text, language):
    """Count the ngrams occurring in the given text.

    Returns a collections.Counter mapping each space-joined ngram to its
    number of occurrences, or an empty dict when no extractor is
    available for the language.
    """
    # Fetch the extractor from the cache, lazily building it on a miss.
    if language in self._extractors:
        extractor = self._extractors[language]
    else:
        if language.iso2 == 'en':
            extractor = EnglishNgramsExtractor()
        elif language.iso2 == 'fr':
            extractor = FrenchNgramsExtractor()
        else:
            extractor = None
        self._extractors[language] = extractor
    # Unsupported language: nothing to count.
    if not extractor:
        return dict()
    # Join each ngram's tokens with spaces, then tally the results.
    return collections.Counter(
        ' '.join(token for token, tag in ngram)
        for ngram in extractor.extract_ngrams(text)
    )
def create_document(self, parentNode, title, metadata, guid=None):
    """Create and save a Document node in the database under parentNode.

    Args:
        parentNode: the Node under which the document is attached; its
            user is reused for the new node.
        title: display name of the new node.
        metadata: raw metadata dict, normalized via format_metadata().
        guid: reserved for future resource deduplication (currently unused).

    Returns:
        The saved child Node.
    """
    metadata = self.format_metadata(metadata)
    # Resolve the document language from its iso3 code when present and
    # known. (Was a bare `except:`; only KeyError can legitimately occur
    # here — from the missing metadata key or an unknown iso3 code.)
    try:
        language = self._languages_iso3[metadata["language_iso3"]]
    except KeyError:
        language = None
    childNode = Node(
        user = parentNode.user,
        type = self._document_nodetype,
        name = title,
        language = language,
        metadata = metadata,
        parent = parentNode
    )
    childNode.save()
    return childNode
def detect_encoding(self, string): def detect_encoding(self, string):
"""Useful method to detect the document encoding. """Useful method to detect the document encoding.
""" """
pass pass
def _parse(self, parentNode, file):
"""This method shall be overriden by inherited classes."""
return list()
def parse(self, parentNode, file=None):
    """Parse the given file (or self._file when omitted) under parentNode.

    ZIP archives are expanded and each member is parsed recursively;
    anything else is handed to the subclass's _parse() hook.
    """
    if file is None:
        # No file given: parse the instance's own file inside a single
        # transaction, then stop. Fix: the missing `return` previously
        # let execution fall through to is_zipfile(None), which raises.
        with transaction.atomic():
            self.parse(parentNode, self._file)
        return
    if zipfile.is_zipfile(file):
        # Recurse into every member of the archive.
        with zipfile.ZipFile(file) as zipArchive:
            for filename in zipArchive.namelist():
                self.parse(parentNode, zipArchive.open(filename, "r"))
    else:
        self._parse(parentNode, file)
def extract(self, parentNode, keys):
    """Extract ngrams from the child nodes, given a list of field names.

    NOTE(review): this method is broken as written — see the inline
    notes on undefined names below; it appears to be dead code that was
    removed in favor of Node.extract_ngrams. Do not call without fixing.
    """
    # get all the descendants of type "document"
    childNodes = parentNode.descendants().filter(type=self._document_nodetype)
    with transaction.atomic():
        for childNode in childNodes:
            # most importantly...
            metadata = childNode.metadata
            # which extractor shall we use?
            # NOTE(review): `language` is never assigned in this method —
            # this line raises NameError; presumably it should be derived
            # from `metadata` or `childNode.language`. TODO confirm intent.
            if language not in self._extractors:
                extractor = None
                if language.iso2 == 'en':
                    # use English
                    extractor = EnglishNgramsExtractor()
                elif language.iso2 == 'fr':
                    # use French
                    extractor = FrenchNgramsExtractor()
                else:
                    # no recognized language has been specified...
                    continue
                self._extractors[language] = extractor
            # NOTE(review): when the language is already cached, `extractor`
            # is never (re)assigned here — it leaks from a previous loop
            # iteration or raises NameError on the first cached hit.
            # extract ngrams from every field, find the id, count them
            ngrams = collections.defaultdict(int)
            ngramscache = self._ngramcaches[language]
            for key in keys:
                # NOTE(review): `text` is undefined — presumably it should
                # be metadata[key]. TODO confirm against the caller.
                for ngram in extractor.extract_ngrams(text):
                    ngram_text = ' '.join([token for token, tag in ngram])
                    # NOTE(review): `ngramtext` is a typo for `ngram_text`
                    # and raises NameError.
                    ngram_id = ngramscache[ngramtext].id
                    ngrams[ngram_id] += 1
            # insert node/ngram associations in the database
            for ngram_id, occurences in ngrams.items():
                Node_Ngram(
                    node_id = childNode.id,
                    ngram_id = ngram_id,
                    occurences = occurences
                ).save()
def format_metadata_dates(self, metadata): def format_metadata_dates(self, metadata):
"""Format the dates found in the metadata. """Format the dates found in the metadata.
Examples: Examples:
{"publication_date": "2014-10-23 09:57:42"} {"publication_date": "2014-10-23 09:57:42"}
-> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014"} -> {"publication_date": "2014-10-23 09:57:42", "publication_year": "2014", ...}
{"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
""" """
# First, check the split dates... # First, check the split dates...
...@@ -189,35 +62,48 @@ class FileParser: ...@@ -189,35 +62,48 @@ class FileParser:
metadata[prefix + "_minute"] = date.strftime("%M") metadata[prefix + "_minute"] = date.strftime("%M")
metadata[prefix + "_second"] = date.strftime("%S") metadata[prefix + "_second"] = date.strftime("%S")
# finally, return the result! # finally, return the transformed result!
return metadata return metadata
def format_metadata_languages(self, metadata): def format_metadata_languages(self, metadata):
"""format the languages found in the metadata.""" """format the languages found in the metadata."""
try: language = None
if "language_fullname" in metadata: for key in ["fullname", "iso3", "iso2"]:
language = self._languages_fullname[metadata["language_fullname"].lower()] if key in metadata:
elif "language_iso3" in metadata: language_symbol = metadata["language_" + key]
language = self._languages_iso3[metadata["language_iso3"].lower()] language = self._languages_cache[language_symbol]
elif "language_iso2" in metadata: if language:
language = self._languages_iso2[metadata["language_iso2"].lower()] break
else: if language:
return metadata metadata["language_iso2"] = language.iso2
except KeyError: metadata["language_iso3"] = language.iso3
# the language has not been found metadata["language_fullname"] = language.fullname
for key in ["language_fullname", "language_iso3", "language_iso2"]:
try:
metadata.pop(key)
except:
continue
return metadata
metadata["language_iso2"] = language.iso2
metadata["language_iso3"] = language.iso3
metadata["language_fullname"] = language.fullname
return metadata return metadata
def format_metadata(self, metadata):
    """Normalize metadata by running every formatting pass in order:
    dates first, then languages. Returns the transformed dict."""
    for formatter in (self.format_metadata_dates, self.format_metadata_languages):
        metadata = formatter(metadata)
    return metadata
def _parse(self, file):
"""This method shall be overriden by inherited classes."""
return list()
def parse(self, file):
"""Parse the file, and its children files found in the file.
"""
# initialize the list of metadata
metadata_list = []
# is the file is a ZIP archive, recurse on each of its files...
if zipfile.is_zipfile(file):
zipArchive = zipfile.ZipFile(file)
for filename in zipArchive.namelist():
metadata_list += self.parse(zipArchive.open(filename, "r"))
# ...otherwise, let's parse it directly!
else:
metadata_list += self._parse(file)
# return the list of formatted metadata
return map(self.format_metadata, metadata_list)
...@@ -5,44 +5,33 @@ from parsing.NgramsExtractors import * ...@@ -5,44 +5,33 @@ from parsing.NgramsExtractors import *
class PubmedFileParser(FileParser): class PubmedFileParser(FileParser):
def _parse(self, parentNode, file): def _parse(self, file):
# open the file as XML # open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True) xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
xml = etree.parse(file, parser=xml_parser) xml = etree.parse(file, parser=xml_parser)
xml_articles = xml.findall('PubmedArticle') xml_articles = xml.findall('PubmedArticle')
with transaction.atomic(): # initialize the list of metadata
# initialize the list of documents metadata_list = []
documents = [] # parse all the articles, one by one
# parse all the articles, one by one for xml_article in xml_articles:
# all database operations should be performed within one transaction # extract data from the document
for xml_article in xml_articles: metadata = {}
# extract data from the document metadata_path = {
metadata = {} "journal" : 'MedlineCitation/Article/Journal/Title',
metadata_path = { "title" : 'MedlineCitation/Article/ArticleTitle',
"journal" : 'MedlineCitation/Article/Journal/Title', "language_iso3" : 'MedlineCitation/Article/Language',
"title" : 'MedlineCitation/Article/ArticleTitle', "doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
"language_iso3" : 'MedlineCitation/Article/Language', "abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]', "publication_year" : 'MedlineCitation/DateCreated/Year',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText', "publication_month" : 'MedlineCitation/DateCreated/Month',
"publication_year" : 'MedlineCitation/DateCreated/Year', "publication_day" : 'MedlineCitation/DateCreated/Day',
"publication_month" : 'MedlineCitation/DateCreated/Month', }
"publication_day" : 'MedlineCitation/DateCreated/Day', for key, path in metadata_path.items():
} try:
for key, path in metadata_path.items(): node = xml_article.find(path)
try: metadata[key] = node.text
node = xml_article.find(path) except:
metadata[key] = node.text metadata[key] = ""
except: metadata_list.append(metadata)
metadata[key] = "" # return the list of metadata
contents = metadata["abstract"] return metadata_list
# create the document in the database
document = self.create_document(
parentNode = parentNode,
title = metadata["title"],
metadata = metadata,
#guid = metadata["doi"],
)
if document:
documents.append(document)
# return the list of documents
return documents
...@@ -7,33 +7,25 @@ class RisFileParser(FileParser): ...@@ -7,33 +7,25 @@ class RisFileParser(FileParser):
_parameters = { _parameters = {
} }
def _parse(self, parentNode, file): def _parse(self, file):
metadata_list = []
metadata = {} metadata = {}
last_key = None last_key = None
last_values = [] last_values = []
with transaction.atomic(): for line in self._file:
for line in self._file: if len(line) > 2:
if len(line) > 2: parameter_key = line[:2]
parameter_key = line[:2] if parameter_key != b' ' and parameter_key != last_key:
if parameter_key != b' ' and parameter_key != last_key: if last_key in self._parameters:
if last_key in self._parameters: parameter = self._parameters[last_key]
parameter = self._parameters[last_key] if parameter["type"] == "metadata":
if parameter["type"] == "metadata": separator = parameter["separator"] if "separator" in parameter else ""
separator = parameter["separator"] if "separator" in parameter else "" metadata[parameter["key"]] = separator.join(last_values)
metadata[parameter["key"]] = separator.join(last_values) elif parameter["type"] == "delimiter":
elif parameter["type"] == "delimiter": language = self._languages_fullname[metadata["language"].lower()]
language = self._languages_fullname[metadata["language"].lower()] metadata_list.append(metadata)
# self.create_document( last_key = parameter_key
# parentNode = parentNode, last_values = []
# title = metadata["title"], last_values.append(line[3:-1].decode())
# metadata = metadata, return metadata_list
# guid = metadata["doi"]
# )
print(self.format_metadata(metadata))
print()
metadata = {}
last_key = parameter_key
last_values = []
last_values.append(line[3:-1].decode())
self._file.close()
#from .Taggers import *
#from .NgramsExtractors import *
from .FileParsers import *
from node.models import Node, NodeType
import zipfile
import collections
# import chardet
class Parser:
    """Creates "Document" child nodes for the files attached to a Node.

    NOTE(review): this class appears superseded by Node.parse_resources();
    kept here with its most obvious defects fixed.
    """

    def __init__(self):
        pass

    def parse_file(self, file):
        # CHECK THE GUID!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        pass

    def parse_node_fichier(self, node):
        """Legacy variant reading node.fichier instead of node.resources."""
        if node.fichier and zipfile.is_zipfile(node.fichier):
            with zipfile.ZipFile(node.fichier, "r") as zipFile:
                node_type = NodeType.objects.get(name="Document")
                for filename in zipFile.namelist():
                    # NOTE(review): `file` is opened but never used —
                    # presumably meant to be attached to the new node.
                    file = zipFile.open(filename, "r")
                    # Fix: was `node.objects.create` — managers live on
                    # the model class, not on instances.
                    Node.objects.create(
                        parent = node,
                        type = node_type,
                        user = node.user,
                    )

    def parse_node(self, node):
        """Create one Document child per file found in each zipped resource."""
        for resource in node.resources:
            # Fix: was `node.resources.file` — each individual resource's
            # file must be inspected, not the collection itself.
            if resource.file and zipfile.is_zipfile(resource.file):
                with zipfile.ZipFile(resource.file, "r") as zipFile:
                    # Fix: was `NodeType.get(...)` — queries go through the
                    # default manager; also hoisted out of the inner loop.
                    node_type = NodeType.objects.get(name="Document")
                    for filename in zipFile.namelist():
                        file = zipFile.open(filename, "r")
                        Node.objects.create(
                            parent = node,
                            type = node_type,
                            user = node.user,
                        )

    def parse_node_recursively(self, node):
        """Parse `node` itself, then every one of its descendants."""
        self.parse_node(node)
        for descendant in node.get_descendants():
            self.parse_node(descendant)
from node.models import Node, NodeType, User, Language from node.models import Node, NodeType, User, Language, ResourceType
from parsing.Caches import Cache from parsing.Caches import Caches
try: try:
me = User.objects.get(username='Mat') me = User.objects.get(username='Mat')
...@@ -7,6 +7,12 @@ except: ...@@ -7,6 +7,12 @@ except:
me = User(username='Mat') me = User(username='Mat')
me.save() me.save()
# Fetch the 'pubmed' resource type, creating it on first run.
try:
    # Fix: model queries go through the default manager. The previous
    # `ResourceType.get(...)` raised AttributeError, so the except branch
    # re-created a duplicate row on every run.
    typePubmed = ResourceType.objects.get(name='pubmed')
except ResourceType.DoesNotExist:
    typePubmed = ResourceType(name='pubmed')
    typePubmed.save()
try: try:
typeCorpus = NodeType.get(name='corpus') typeCorpus = NodeType.get(name='corpus')
typeDoc = NodeType.get(name='document') typeDoc = NodeType.get(name='document')
...@@ -25,25 +31,24 @@ try: ...@@ -25,25 +31,24 @@ try:
except: except:
corpus = Node(name='My first corpus', type=typeCorpus, user=me) corpus = Node(name='My first corpus', type=typeCorpus, user=me)
corpus.save() corpus.save()
for i in range(64): # for i in range(64):
title = 'Document #%d' % i # title = 'Document #%d' % i
Node( # Node(
user = me, # user = me,
# type = self._document_nodetype, # # type = self._document_nodetype,
name = title, # name = title,
language = english, # language = english,
metadata = {'title':title}, # metadata = {'title':title},
#resource = resource, # #resource = resource,
type = typeDoc, # type = typeDoc,
parent = corpus # parent = corpus
).save() # ).save()
corpus.add_resource(file='/path/to/file')
corpus.parse()
exit() corpus.children.all().delete()
corpus.add_resource(file='./data_samples/pubmed.zip', type=typePubmed)
corpus.parse_resources()
cache = Cache() cache = Caches()
for child in corpus.children.all(): for child in corpus.children.all():
print(child.id) print('#%d\t%s\n%s\n\n' % (child.id, child.name, child.metadata['abstract']))
child.extract_ngrams(['title'], cache) # child.extract_ngrams(['title'], cache)
\ No newline at end of file \ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment