Commit b96a7eec authored by Mathieu Rodic

[FEATURE] The ngrams extractor on Node seems to be working.

parent 755a8d4d
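
For context, a minimal usage sketch of the reworked extractor, taken from the sample script touched by this commit (the `corpus` and `Caches` names come from that script; nothing else is assumed): `extract_ngrams` accepts either a list of metadata keys, each occurrence counted with weight 1, or a dict mapping keys to weights.

    # sketch only, assuming the corpus/cache setup from the sample script in this commit
    cache = Caches()
    for child in corpus.children.all():
        # count title/abstract ngrams with weight 1 each
        child.extract_ngrams(['title', 'abstract'], cache)
        # or weight the metadata fields differently
        # child.extract_ngrams({'title': 1., 'abstract': .2}, cache)
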
@@ -129,18 +129,21 @@ class Node(CTENode):
         if isinstance(keys, dict):
             for key, weight in keys.items():
                 for ngram in extractor.extract_ngrams(self.metadata[key]):
-                    associations[key] += weight
+                    terms = ' '.join([token for token, tag in ngram])
+                    associations[terms] += weight
         else:
             for key in keys:
                 for ngram in extractor.extract_ngrams(self.metadata[key]):
-                    associations[key] += 1
+                    terms = ' '.join([token for token, tag in ngram])
+                    associations[terms] += 1
         # insert the occurrences in the database
+        # TODO: use bulk_create instead
         for ngram_text, weight in associations.items():
             Node_Ngram(
                 node = self,
                 ngram = ngrams[ngram_text],
                 weight = weight
-            )
+            ).save()
 
 class Node_Resource(models.Model):

@@ -42,8 +42,10 @@ class NgramsExtractorsCache(defaultdict):
         # format the language
         if isinstance(key, str):
             language = key.strip().lower()
-        else:
+        elif key:
             language = key.iso2
+        else:
+            language = None
         # find the proper extractor
         if language in ["en", "eng", "english"]:
             Extractor = EnglishNgramsExtractor
@@ -66,15 +68,14 @@ class NgramsExtractorsCache(defaultdict):
 class LanguagesCache(defaultdict):
 
-    def __init__(self):
-        for language in node.models.Language.objects.all():
-            self[language.iso2.lower()] = language
-            self[language.iso3.lower()] = language
-            self[language.fullname.lower()] = language
-
     def __missing__(self, key):
+        if len(self) == 0:
+            for language in node.models.Language.objects.all():
+                self[str(language.iso2.lower())] = language
+                self[str(language.iso3.lower())] = language
+                self[str(language.fullname.lower())] = language
         betterKey = key.strip().lower()
-        self[key] = self[betterKey] if betterKey in self else None
+        self[key] = self[betterKey] if betterKey in self.keys() else None
         return self[betterKey]

@@ -69,8 +69,9 @@ class FileParser:
         """format the languages found in the metadata."""
         language = None
         for key in ["fullname", "iso3", "iso2"]:
-            if key in metadata:
-                language_symbol = metadata["language_" + key]
+            language_key = "language_" + key
+            if language_key in metadata:
+                language_symbol = metadata[language_key]
                 language = self._languages_cache[language_symbol]
                 if language:
                     break

@@ -31,18 +31,7 @@ try:
 except:
     corpus = Node(name='My first corpus', type=typeCorpus, user=me)
     corpus.save()
 
-# for i in range(64):
-#     title = 'Document #%d' % i
-#     Node(
-#         user     = me,
-#         # type   = self._document_nodetype,
-#         name     = title,
-#         language = english,
-#         metadata = {'title':title},
-#         #resource = resource,
-#         type     = typeDoc,
-#         parent   = corpus
-#     ).save()
-
 corpus.children.all().delete()
 corpus.add_resource(file='./data_samples/pubmed.zip', type=typePubmed)

@@ -50,5 +39,11 @@ corpus.parse_resources()
 cache = Caches()
 for child in corpus.children.all():
-    print('#%d\t%s\n%s\n\n' % (child.id, child.name, child.metadata['abstract']))
-    # child.extract_ngrams(['title'], cache)
\ No newline at end of file
+    if child.language:
+        print('#%d\t%s\n%s\n' % (child.id, child.name, child.language.fullname))
+    else:
+        print('#%d\t%s\n\n' % (child.id, child.name))
+    # print(child.metadata)
+    # print()
+    child.extract_ngrams(['title', 'abstract'], cache)
+    # child.extract_ngrams({'title':1., 'abstract':.2}, cache)
\ No newline at end of file