Commit 4381e35a authored by Mathieu Rodic

Started working on the ngrams extractors.

Changed the models a little.
parent db5a17ce
from NgramsExtractor import NgramsExtractor
class EnglishNgramsExtractor(NgramsExtractor):
    """N-grams extractor for English.

    Adds nothing to ``NgramsExtractor`` for now: every method is inherited
    unchanged.
    """
\ No newline at end of file
from NgramsExtractor import NgramsExtractor
class FrenchNgramsExtractor(NgramsExtractor):
    """N-grams extractor for French.

    Adds nothing to ``NgramsExtractor`` for now: every method is inherited
    unchanged.
    """
\ No newline at end of file
"""Base class for all ngrams extractors.
"""
class NgramsExtractor:
    """Common interface shared by the language-specific ngrams extractors."""

    def __init__(self):
        """Class instantiation.

        This method can be overridden.
        """
        pass

    def tag_ngrams(self, contents):
        """Tag the ngrams found in ``contents``.

        The base implementation ignores ``contents`` and returns an empty
        list; presumably language-specific subclasses are meant to override
        it (to be confirmed once those subclasses are implemented).
        """
        return []

    def extract_ngrams(self, contents):
        """Extract a list of ngrams.

        Returns a list of the ngrams found in the given text. For now this
        is simply whatever ``tag_ngrams`` yields for ``contents``, with no
        further filtering applied.
        """
        # The text must be forwarded: ``tag_ngrams`` requires it, and calling
        # it without an argument raises a TypeError.
        tagged_ngrams = self.tag_ngrams(contents)
        # Always hand back a list, as documented, whatever iterable a
        # subclass chooses to return from ``tag_ngrams``.
        return list(tagged_ngrams)
\ No newline at end of file
from FrenchNgramsExtractor import FrenchNgramsExtractor
from EnglishNgramsExtractor import EnglishNgramsExtractor
\ No newline at end of file
......@@ -9,6 +9,8 @@ from time import time
from django.contrib.auth.models import User
from language import Language
from collections import defaultdict
def upload_to(instance, filename):
    """Build the storage path for an uploaded file.

    The path has the form ``corpora/<username>/<timestamp>/<filename>``,
    where the username is read from ``instance.user.username`` and the
    timestamp is the current ``time()`` value rendered with ``%f``.

    NOTE(review): the signature matches what Django expects of an
    ``upload_to`` callable -- confirm against the field that uses it.
    """
    owner = instance.user.username
    stamp = time()
    return 'corpora/%s/%f/%s' % (owner, stamp, filename)
......@@ -24,8 +26,38 @@ class Language(models.Model):
class Ngram(models.Model):
    """Database record for a single ngram."""

    # Optional link to a Language row; the column is set to NULL when the
    # referenced language is deleted.
    language = models.ForeignKey(
        Language,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
    )
    # Integer stored with the ngram (presumably its number of terms -- confirm).
    n = models.IntegerField()
    # Text of the ngram, limited to 255 characters.
    terms = models.CharField(max_length=255)
class Ngram_Cache:
    """In-memory cache of ngram primary keys, backed by the ``Ngram`` table.

    Each lookup normalizes the terms, checks the per-language cache, and
    only goes to the database (fetching the row, or creating it when it is
    missing) on a cache miss.
    """

    def __init__(self, language):
        """Resolve ``language`` to the primary key of a ``Language`` row.

        Args:
            language: language code, case-insensitive. A three-letter code
                is looked up through ``iso3`` and a two-letter code through
                ``iso2``.
        """
        # ``None`` rather than a dict: this value is later passed as
        # ``language_id`` to ``Ngram``, whose ``language`` column is nullable.
        self._language_id = None
        # Kept for compatibility; not used by this class itself.
        self._ngram_ids = []
        # language -> {normalized terms -> Ngram primary key}
        self._cache = defaultdict(dict)
        # get the language id
        language = language.lower()
        # ``len`` returns an int, so it must be compared to ints; comparing
        # to "3"/"2" would never match.
        if len(language) == 3:
            self._language_id = Language.objects.get(iso3=language).id
        elif len(language) == 2:
            self._language_id = Language.objects.get(iso2=language).id
        else:
            # NOTE(review): unfinished placeholder kept as-is -- the lookup
            # uses a hard-coded code and its result is discarded, so
            # ``_language_id`` stays ``None`` for any other input.
            import pycountry
            pycountry.languages.get(alpha2='an')

    def get(self, language, terms):
        """Return the primary key of the ngram matching ``terms``.

        Args:
            language: key of the cache bucket to use.
            terms: text of the ngram; it is stripped and lower-cased before
                being looked up.

        Returns:
            The primary key of the matching ``Ngram`` row. The row is
            created, attached to the language resolved at construction
            time, when no row with these terms exists yet.
        """
        # get the term id
        terms = terms.strip().lower()
        known = self._cache[language]
        if terms not in known:
            try:
                ngram = Ngram.objects.get(terms=terms)
            except Ngram.DoesNotExist:
                # ``n`` is the number of whitespace-separated terms, not the
                # number of characters in the string.
                ngram = Ngram(
                    terms=terms,
                    n=len(terms.split()),
                    language_id=self._language_id,
                )
                ngram.save()
            known[terms] = ngram.pk
        # return the term id
        return known[terms]
class Resource(models.Model):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment