Commit 2fbda243 authored by Administrator's avatar Administrator

[FEAT] Ngrams with language and tags manyToMany Fields.

parent 1283a347
from gargantext_web.db import *
# Instantiate table Tag from the part-of-speech label file.
# Every non-empty line is expected to be "<name>\t<description>".
# The `with` block guarantees the file is closed even when a line is
# malformed or adding a row fails.
with open("/srv/gargantext/init/part_of_speech_labels.txt", 'r') as f:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing empty line at end of file)
            # instead of failing on the tuple unpacking below.
            continue
        name, description = line.split('\t')
        session.add(Tag(name=name, description=description))

# Persist all the tags in one transaction.
session.commit()
CC Coordinating conjunction
CD Cardinal number
DT Determiner
EX Existential there
FW Foreign word
IN Preposition or subordinating conjunction
JJ Adjective
JJR Adjective, comparative
JJS Adjective, superlative
LS List item marker
MD Modal
NN Noun, singular or mass
NNS Noun, plural
NNP Proper noun, singular
NNPS Proper noun, plural
PDT Predeterminer
POS Possessive ending
PRP Personal pronoun
PRP$ Possessive pronoun
RB Adverb
RBR Adverb, comparative
RBS Adverb, superlative
RP Particle
SYM Symbol
TO to
UH Interjection
VB Verb, base form
VBD Verb, past tense
VBG Verb, gerund or present participle
VBN Verb, past participle
VBP Verb, non-3rd person singular present
VBZ Verb, 3rd person singular present
WDT Wh-determiner
WP Wh-pronoun
WP$ Possessive wh-pronoun
WRB Wh-adverb
NGRA Ngram
......@@ -52,20 +52,34 @@ class ResourceType(models.Model):
# String form of the instance: its `name` attribute.
def __str__(self):
return self.name
# NOTE(review): a second `NgramTag` class (with `ngram` and `tag` foreign
# keys) is defined further down in this listing. This single-field version
# looks like the pre-change definition rendered by the diff -- confirm which
# one is current before relying on it.
class NgramTag(models.Model):
# Tag code: at most 4 characters, unique across rows.
tag = models.CharField(max_length=4, unique=True)
class Tag(models.Model):
    """Label that can be attached to ngrams.

    Presumably a part-of-speech label (the init script in this commit fills
    the table from a part-of-speech label file) -- confirm against callers.
    """
    # Short code of the label: at most 4 characters, unique.
    name = models.CharField(max_length=4, unique=True)
    # Human-readable description of the label, also unique.
    description = models.CharField(max_length=255, unique=True)

    def __str__(self):
        # Same convention as the sibling models: display by the short name.
        return self.name
class Ngram(models.Model):
    """A sequence of `n` terms, related to languages, tags and nodes.

    Each relation is many-to-many and goes through an explicit association
    model (`NgramLanguage`, `NgramTag`, `Node_Ngram`).
    """
    # Only one definition of `language` and `tag` is kept: the earlier
    # ForeignKey assignments were shadowed by these ManyToManyFields and had
    # no effect. `null=True` is dropped because it has no effect on a
    # ManyToManyField (the relation lives in the association table).
    language = models.ManyToManyField(blank=True, through='NgramLanguage', to='Language')
    # Number of tokens in the ngram.
    n = models.IntegerField()
    # The space-joined terms of the ngram; unique across the table.
    terms = models.CharField(max_length=255, unique=True)
    nodes = models.ManyToManyField(through='Node_Ngram', to='Node')
    tag = models.ManyToManyField(blank=True, through='NgramTag', to='Tag')

    def __str__(self):
        return self.terms
class NgramTag(models.Model):
    """Association row linking one `Ngram` to one `Tag`."""
    ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
    # on_delete is spelled out to match `ngram` above; CASCADE is what Django
    # applied implicitly when the argument was omitted (it is mandatory from
    # Django 2.0 on).
    tag = models.ForeignKey(Tag, on_delete=models.CASCADE)

    def __str__(self):
        # Rendered as "<ngram terms>: <tag name>".
        return "%s: %s" % (self.ngram.terms, self.tag.name)
class NgramLanguage(models.Model):
    """Association row linking one `Ngram` to one `Language`."""
    ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
    # on_delete is spelled out to match `ngram` above; CASCADE is what Django
    # applied implicitly when the argument was omitted (it is mandatory from
    # Django 2.0 on).
    language = models.ForeignKey(Language, on_delete=models.CASCADE)

    def __str__(self):
        # Rendered as "<ngram terms>: <language full name>".
        return "%s: %s" % (self.ngram.terms, self.language.fullname)
class Resource(models.Model):
user = models.ForeignKey(User)
......
......@@ -211,7 +211,11 @@ def extract_ngrams(corpus, keys):
language.id: language.iso2
for language in session.query(Language)
}
# Accumulators filled by the extraction loop further down.
# Distinct (n, terms) pairs.
ngrams_data = set()
# Distinct (terms, language_id) pairs.
ngrams_language_data = set()
# Distinct (terms, tag_id) pairs.
ngrams_tag_data = set()
# node_id -> terms -> occurrence count; missing entries start at 0.
node_ngram_list = defaultdict(lambda: defaultdict(int))
for nodeinfo in metadata_query:
node_id = nodeinfo[0]
......@@ -227,12 +231,25 @@ def extract_ngrams(corpus, keys):
if text is not None and len(text):
    # Square brackets are stripped from the text before extraction.
    ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]",""))
    for ngram in ngrams:
        # Each ngram is a sequence of (token, tag) pairs.
        n = len(ngram)
        if n == 0:
            # An empty ngram has no terms and no tag: without this guard
            # `tag_id` would be unbound on the first iteration, or silently
            # reuse the value left by the previous ngram.
            continue
        terms = ' '.join([token for token, tag in ngram]).lower()
        # TODO BUG here (flagged by the original author without details).
        # NOTE(review): a tag label missing from cache.Tag presumably still
        # raises on the lookups below -- confirm how cache.Tag handles misses.
        if n == 1:
            # Single-token ngram: use the tag reported for that token.
            tag_id = cache.Tag[ngram[0][1]].id
        else:
            # Multi-token ngrams are all filed under the 'NN' tag.
            tag_id = cache.Tag['NN'].id
        # Count this occurrence for the current node.
        node_ngram_list[node_id][terms] += 1
        # Sets de-duplicate, so each distinct tuple is stored once.
        ngrams_data.add((n, terms))
        ngrams_language_data.add((terms, language_id))
        ngrams_tag_data.add((terms, tag_id))
# insert ngrams to temporary table
# Log how many distinct (n, terms) pairs were collected.
dbg.show('find ids for the %d ngrams' % len(ngrams_data))
# get_cursor() returns a pair; the second item is the cursor used for the SQL
# statements below (the first is presumably the connection -- confirm).
db, cursor = get_cursor()
......@@ -256,6 +273,7 @@ def extract_ngrams(corpus, keys):
ngram.terms = tmp__ngrams.terms
''' % (Ngram.__table__.name, ))
# insert, then get the ids back
cursor.execute('''
INSERT INTO
%s (n, terms)
......@@ -266,6 +284,8 @@ def extract_ngrams(corpus, keys):
WHERE
id IS NULL
''' % (Ngram.__table__.name, ))
cursor.execute('''
UPDATE
tmp__ngrams
......@@ -278,11 +298,13 @@ def extract_ngrams(corpus, keys):
AND
tmp__ngrams.id IS NULL
''' % (Ngram.__table__.name, ))
# Read every (id, terms) row back from tmp__ngrams and index the ids by terms.
cursor.execute('SELECT id, terms FROM tmp__ngrams')
ngram_ids = {row[1]: row[0] for row in cursor.fetchall()}

# Announce the association step and start with an empty row list
# (it is filled by code outside this view).
dbg.show('insert associations')
node_ngram_data = []
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment