Commit eeb90a41 authored by Mathieu Rodic

[OPTI] Stopped sorting values in 'text' metadata fields

[CODE] Added 'MatInit.py', so we have some starting data
parent a7330cb9
# Without this, we couldn't use the Django environment
# (setdefault only writes a variable when it is not already set, so values
# exported by the caller's shell take precedence over the ones below).
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext_web.settings")
# NOTE(review): presumably read by django-hstore to disable its global
# registration -- confirm against the hstore version in use.
os.environ.setdefault("DJANGO_HSTORE_GLOBAL_REGISTER", "False")
# We're gonna use all the models!
# The star import provides every model name used in the rest of this script;
# it has to stay below the environment setup above.
from node.models import *
# Node.objects.get(id=26514).children.all().make_metadata_filterable()
# exit()
# Reset: delete every row of each model listed below, announcing the
# underlying database table name before each deletion.
tables_to_empty = [Node, Node_Metadata, Metadata, NodeType, ResourceType, Resource]
for table in tables_to_empty:
    print('Empty table "%s"...' % table._meta.db_table)
    table.objects.all().delete()
# Integration: metadata types
print('Initialize metadata...')
# Maps each metadata key to the name of its value type
# ('datetime', 'string' or 'text').
metadata = {
    'publication_date': 'datetime',
    'authors': 'string',
    'language_fullname': 'string',
    'abstract': 'text',
    'title': 'string',
    'source': 'string',
    'volume': 'string',
    'text': 'text',
    'date': 'datetime',
    'page': 'string',
    'doi': 'string',
    'journal': 'string',
}
# One Metadata row per key. The loop variable is deliberately not called
# `type`: at module level that would shadow the builtin for the rest of the
# script.
for name, metadata_type in metadata.items():
    Metadata(name=name, type=metadata_type).save()
# Integration: languages
print('Initialize languages...')
import pycountry
# Start from an empty Language table, then insert one row per pycountry
# language that carries an `alpha2` attribute in its instance dictionary.
Language.objects.all().delete()
for language in pycountry.languages:
    if 'alpha2' not in vars(language):
        continue
    Language(
        iso2=language.alpha2,
        iso3=language.bibliographic,
        fullname=language.name,
        # 1 for English and French, 0 for every other language
        implemented=int(language.alpha2 in ['en', 'fr']),
    ).save()
# Keep handles on the two languages flagged as implemented.
english = Language.objects.get(iso2='en')
french = Language.objects.get(iso2='fr')
# Integration: users
print('Initialize users...')
# Reuse the 'mat' user when it exists, create it otherwise.
# Only a missing row triggers creation; any other failure (database error,
# several matching rows, keyboard interrupt) now propagates instead of being
# mistaken for "user not found".
try:
    me = User.objects.get(username='mat')
except User.DoesNotExist:
    me = User(username='mat')
    me.save()
# Integration: node types
print('Initialize node types...')
# Fetch each node type by name, creating it when it is missing.
# Only NodeType.DoesNotExist triggers creation (its message is still printed);
# unrelated errors are no longer swallowed and followed by a blind insert.
node_types = {}
for node_type_name in ('Project', 'Corpus', 'Document'):
    try:
        node_type = NodeType.objects.get(name=node_type_name)
    except NodeType.DoesNotExist as error:
        print(error)
        node_type = NodeType(name=node_type_name)
        node_type.save()
    node_types[node_type_name] = node_type
# Names used by the rest of the script.
typeProject = node_types['Project']
typeCorpus = node_types['Corpus']
typeDoc = node_types['Document']
# Integration: resource types
print('Initialize resource...')
# Fetch each resource type by name, creating it when it is missing.
# Every type is handled on its own: a single missing type must not cause the
# types that already exist to be inserted a second time. Only
# ResourceType.DoesNotExist triggers creation (its message is printed once per
# missing type); any other error propagates.
resource_types = {}
for resource_type_name in ('pubmed', 'isi', 'ris', 'europress'):
    try:
        resource_type = ResourceType.objects.get(name=resource_type_name)
    except ResourceType.DoesNotExist as error:
        print(error)
        resource_type = ResourceType(name=resource_type_name)
        resource_type.save()
    resource_types[resource_type_name] = resource_type
# Names used by the rest of the script.
typePubmed = resource_types['pubmed']
typeIsi = resource_types['isi']
typeRis = resource_types['ris']
typePresse = resource_types['europress']
# Integration: project
print('Initialize project...')
# Reuse the 'Bees project' node when it exists, create it otherwise.
# Only Node.DoesNotExist triggers creation; in particular, finding several
# nodes with that name now raises instead of silently adding one more.
try:
    project = Node.objects.get(name='Bees project')
except Node.DoesNotExist:
    project = Node(name='Bees project', type=typeProject, user=me)
    project.save()
# Integration: corpus
print('Initialize corpus...')
# Reuse the 'PubMed corpus' node when it exists, otherwise create it as a
# child of the project. Only Node.DoesNotExist triggers creation; other
# errors propagate.
try:
    corpus_pubmed = Node.objects.get(name='PubMed corpus')
except Node.DoesNotExist:
    corpus_pubmed = Node(parent=project, name='PubMed corpus', type=typeCorpus, user=me)
    corpus_pubmed.save()

# Attach the sample PubMed export to the corpus.
print('Initialize resource...')
corpus_pubmed.add_resource(
    # file='./data_samples/pubmed.zip',
    file='./data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
    type=typePubmed,
    user=me
)

# List every resource now attached to the corpus.
for resource in corpus_pubmed.get_resources():
    print('Resource #%d - %s - %s' % (resource.id, resource.digest, resource.file))

# Parse the attached resources, then extract ngrams from the 'title' field
# of the corpus' child nodes.
print('Parse corpus #%d...' % (corpus_pubmed.id, ))
corpus_pubmed.parse_resources(verbose=True)
print('Extract corpus #%d...' % (corpus_pubmed.id, ))
corpus_pubmed.children.all().extract_ngrams(['title',])
print('Parsed corpus #%d.' % (corpus_pubmed.id, ))

# Stop here with a success status. `exit()` is a helper injected by the
# `site` module for interactive use and is not guaranteed to exist in every
# run mode; raising SystemExit directly has the same effect.
raise SystemExit
\ No newline at end of file
......@@ -6,7 +6,7 @@ from django.db.models import Avg, Max, Min, Count, Sum
# from node.models import Language, ResourceType, Resource
# from node.models import Node, NodeType, Node_Resource, Project, Corpus
from sqlalchemy import text
from sqlalchemy import text, distinct
from sqlalchemy.sql import func
from sqlalchemy.orm import aliased
......@@ -18,6 +18,7 @@ Ngram = node.models.Ngram.sa
Metadata = node.models.Metadata.sa
Node_Metadata = node.models.Node_Metadata.sa
# for debugging only
def literalquery(statement, dialect=None):
"""Generate an SQL expression string with bound parameters rendered inline
for the given SQLAlchemy statement.
......@@ -62,6 +63,7 @@ def literalquery(statement, dialect=None):
return LiteralCompiler(dialect, statement)
# this one should prove itself useless quite soon
def get_connection():
import sqlalchemy.orm
from django.db import connections
......@@ -72,9 +74,7 @@ def get_connection():
engine = get_engine()
return engine.connect()
# connection = engine.connect()
# for recursive queries
# _sql_cte = '''
# WITH RECURSIVE cte ("depth", "path", "ordering", "id") AS (
# SELECT 1 AS depth,
......@@ -103,7 +103,7 @@ import json
def JsonHttpResponse(data, status=200):
    """Return an HttpResponse carrying `data` serialized as JSON.

    The body is produced by `json.dumps` with a 4-space indent, and the
    response is labelled as UTF-8 encoded JSON.

    Parameters:
        data: any value `json.dumps` can serialize.
        status: HTTP status code of the response (200 by default).
    """
    return HttpResponse(
        content=json.dumps(data, indent=4),
        # A single content_type keyword: passing it twice is a syntax error.
        content_type='application/json; charset=utf-8',
        status=status,
    )
Http400 = SuspiciousOperation
......@@ -122,9 +122,6 @@ def CsvHttpResponse(data, headers=None, status=200):
writer.writerow(row)
return response
Http400 = SuspiciousOperation
Http403 = PermissionDenied
_ngrams_order_columns = {
"frequency" : "-count",
"alphabetical" : "terms"
......@@ -223,30 +220,38 @@ class CorpusController:
.group_by(Metadata)
)
# build a collection with the metadata ekys
# build a collection with the metadata keys
collection = []
for metadata in metadata_query:
valuesCount = 0
values = None
# count values
value_column = getattr(Node_Metadata, 'value_' + metadata.type)
node_metadata_query = (Node_Metadata
.query(value_column)
.join(Node, Node.id == Node_Metadata.node_id)
.filter(Node.parent_id == node_id)
.filter(Node_Metadata.metadata_id == metadata.id)
.group_by(value_column)
.order_by(value_column)
)
valuesCount = node_metadata_query.count()
# if there is less than 32 values, retrieve them
valuesCount = None
if metadata.type != 'text':
value_column = getattr(Node_Metadata, 'value_' + metadata.type)
node_metadata_query = (Node_Metadata
.query(value_column)
.join(Node, Node.id == Node_Metadata.node_id)
.filter(Node.parent_id == node_id)
.filter(Node_Metadata.metadata_id == metadata.id)
.group_by(value_column)
.order_by(value_column)
)
valuesCount = node_metadata_query.count()
# if there is less than 32 values, give them
values = None
if valuesCount <= 32:
if isinstance(valuesCount, int) and valuesCount <= 32:
values = [row[0] for row in node_metadata_query.all()]
if metadata.type == 'datetime':
values = []
values = map(lambda x: x.isoformat(), values)
# adding this metadata to the collection
collection.append({
'key': metadata.name,
'type': metadata.type,
'values': values,
'valuesCount': valuesCount,
})
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment