Commit b96a7eec authored by Mathieu Rodic

[FEATURE] The ngrams extractor on Node seems to be working.

parent 755a8d4d
......@@ -129,18 +129,21 @@ class Node(CTENode):
if isinstance(keys, dict):
for key, weight in keys.items():
for ngram in extractor.extract_ngrams(self.metadata[key]):
associations[key] += weight
terms = ' '.join([token for token, tag in ngram])
associations[ngram] += weight
else:
for key in keys:
for ngram in extractor.extract_ngrams(self.metadata[key]):
associations[key] += 1
terms = ' '.join([token for token, tag in ngram])
associations[terms] += 1
# insert the occurrences in the database
# TODO: use bulk_create instead
for ngram_text, weight in associations.items():
Node_Ngram(
node = self,
ngram = ngrams[ngram_text],
weight = weight
)
).save()
class Node_Resource(models.Model):
......
......@@ -42,8 +42,10 @@ class NgramsExtractorsCache(defaultdict):
# format the language
if isinstance(key, str):
language = key.strip().lower()
else:
elif key:
language = key.iso2
else:
language = None
# find the proper extractor
if language in ["en", "eng", "english"]:
Extractor = EnglishNgramsExtractor
......@@ -66,15 +68,14 @@ class NgramsExtractorsCache(defaultdict):
class LanguagesCache(defaultdict):
def __init__(self):
for language in node.models.Language.objects.all():
self[language.iso2.lower()] = language
self[language.iso3.lower()] = language
self[language.fullname.lower()] = language
def __missing__(self, key):
if len(self) == 0:
for language in node.models.Language.objects.all():
self[str(language.iso2.lower())] = language
self[str(language.iso3.lower())] = language
self[str(language.fullname.lower())] = language
betterKey = key.strip().lower()
self[key] = self[betterKey] if betterKey in self else None
self[key] = self[betterKey] if betterKey in self.keys() else None
return self[betterKey]
......
......@@ -69,8 +69,9 @@ class FileParser:
"""format the languages found in the metadata."""
language = None
for key in ["fullname", "iso3", "iso2"]:
if key in metadata:
language_symbol = metadata["language_" + key]
language_key = "language_" + key
if language_key in metadata:
language_symbol = metadata[language_key]
language = self._languages_cache[language_symbol]
if language:
break
......
......@@ -31,18 +31,7 @@ try:
except:
corpus = Node(name='My first corpus', type=typeCorpus, user=me)
corpus.save()
# for i in range(64):
# title = 'Document #%d' % i
# Node(
# user = me,
# # type = self._document_nodetype,
# name = title,
# language = english,
# metadata = {'title':title},
# #resource = resource,
# type = typeDoc,
# parent = corpus
# ).save()
corpus.children.all().delete()
corpus.add_resource(file='./data_samples/pubmed.zip', type=typePubmed)
......@@ -50,5 +39,11 @@ corpus.parse_resources()
cache = Caches()
for child in corpus.children.all():
print('#%d\t%s\n%s\n\n' % (child.id, child.name, child.metadata['abstract']))
# child.extract_ngrams(['title'], cache)
\ No newline at end of file
if child.language:
print('#%d\t%s\n%s\n' % (child.id, child.name, child.language.fullname))
else:
print('#%d\t%s\n\n' % (child.id, child.name))
# print(child.metadata)
# print()
child.extract_ngrams(['title', 'abstract'], cache)
# child.extract_ngrams({'title':1., 'abstract':.2}, cache)
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment