Commit eeb90a41 authored by Mathieu Rodic

[OPTI] Stopped sorting values in 'text' metadata fields

[CODE] Added 'MatInit.py', so we have some starting data
parent a7330cb9
# Without this, we couldn't use the Django environment: the settings module
# has to be named before the models are imported further down.
import os
# setdefault() only fills in a value when the variable is not already set,
# so anything exported by the calling shell takes precedence.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext_web.settings")
os.environ.setdefault("DJANGO_HSTORE_GLOBAL_REGISTER", "False")
# We're gonna use all the models!
# NOTE(review): the model names used by the rest of this script (Node,
# Metadata, Language, User, ...) are presumably all supplied by this
# star-import -- confirm against node/models.py.
from node.models import *
# Node.objects.get(id=26514).children.all().make_metadata_filterable()
# exit()
# Reset: all data
# Every model listed here has all of its rows deleted, in the order given,
# before the script repopulates the database.
tables_to_empty = [
    Node,
    Node_Metadata,
    Metadata,
    NodeType,
    ResourceType,
    Resource,
]
for model in tables_to_empty:
    table_name = model._meta.db_table
    print('Empty table "%s"...' % table_name)
    model.objects.all().delete()
# Integration: metadata types
# Maps each metadata key to the name of its value type; one Metadata row is
# stored per entry.
print('Initialize metadata...')
metadata = {
    'publication_date': 'datetime',
    'authors': 'string',
    'language_fullname': 'string',
    'abstract': 'text',
    'title': 'string',
    'source': 'string',
    'volume': 'string',
    'text': 'text',
    'date': 'datetime',
    'page': 'string',
    'doi': 'string',
    'journal': 'string',
}
# The loop variable is deliberately not called `type`: at module level that
# would rebind the builtin for the remainder of the script.
for name, metadata_type in metadata.items():
    Metadata(name=name, type=metadata_type).save()
# Integration: languages
# Rebuilds the Language table from the pycountry catalogue, then keeps
# handles on the English and French rows.
print('Initialize languages...')
import pycountry
Language.objects.all().delete()
for language in pycountry.languages:
    # Entries without an 'alpha2' attribute of their own are skipped.
    if 'alpha2' not in language.__dict__:
        continue
    iso2_code = language.alpha2
    Language(
        iso2=iso2_code,
        iso3=language.bibliographic,
        fullname=language.name,
        # 1 for English and French, 0 for every other language.
        implemented=int(iso2_code in ['en', 'fr']),
    ).save()
english = Language.objects.get(iso2='en')
french = Language.objects.get(iso2='fr')
# Integration: users
# Reuse the 'mat' account when it exists, otherwise create it.
print('Initialize users...')
try:
    me = User.objects.get(username='mat')
except User.DoesNotExist:
    # Only a missing row triggers creation; any other failure (database
    # error, interrupt, ...) propagates instead of being swallowed.
    me = User(username='mat')
    me.save()
def _get_or_create_by_name(model, name):
    """Return the row of ``model`` called ``name``, creating it when absent.

    ``model`` is a model class exposing ``objects.get`` and accepting a
    ``name`` keyword. When the lookup finds nothing, the lookup error is
    printed, a new row is saved and that new instance is returned.
    """
    try:
        return model.objects.get(name=name)
    except model.DoesNotExist as error:
        print(error)
        instance = model(name=name)
        instance.save()
        return instance


# Integration: node types
print('Initialize node types...')
typeProject = _get_or_create_by_name(NodeType, 'Project')
typeCorpus = _get_or_create_by_name(NodeType, 'Corpus')
typeDoc = _get_or_create_by_name(NodeType, 'Document')

# Integration: resource types
# Each type is looked up on its own, so one missing row no longer causes the
# rows that already exist to be created a second time.
print('Initialize resource...')
typePubmed = _get_or_create_by_name(ResourceType, 'pubmed')
typeIsi = _get_or_create_by_name(ResourceType, 'isi')
typeRis = _get_or_create_by_name(ResourceType, 'ris')
typePresse = _get_or_create_by_name(ResourceType, 'europress')
# Integration: project
# Reuse the 'Bees project' node when it exists, otherwise create it as a
# Project-typed node owned by `me`.
print('Initialize project...')
try:
    project = Node.objects.get(name='Bees project')
except Node.DoesNotExist:
    # Only a missing row triggers creation; other errors propagate.
    project = Node(name='Bees project', type=typeProject, user=me)
    project.save()
# Integration: corpus
# Reuse the 'PubMed corpus' node when it exists, otherwise create it as a
# Corpus-typed child of the project.
print('Initialize corpus...')
try:
    corpus_pubmed = Node.objects.get(name='PubMed corpus')
except Node.DoesNotExist:
    # Only a missing row triggers creation; other errors propagate.
    corpus_pubmed = Node(parent=project, name='PubMed corpus', type=typeCorpus, user=me)
    corpus_pubmed.save()
# Attach the sample PubMed export to the corpus, then list what is attached.
print('Initialize resource...')
corpus_pubmed.add_resource(
    # file='./data_samples/pubmed.zip',
    file='./data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
    type=typePubmed,
    user=me,
)
for resource in corpus_pubmed.get_resources():
    print('Resource #%d - %s - %s' % (resource.id, resource.digest, resource.file))

# Parse the attached resources, then run ngram extraction on the 'title'
# field of the corpus' child nodes.
print('Parse corpus #%d...' % (corpus_pubmed.id, ))
corpus_pubmed.parse_resources(verbose=True)
print('Extract corpus #%d...' % (corpus_pubmed.id, ))
corpus_pubmed.children.all().extract_ngrams(['title',])
print('Parsed corpus #%d.' % (corpus_pubmed.id, ))

# End the script explicitly. The exit() helper is injected by the `site`
# module for interactive sessions and is not meant for programs, so
# SystemExit is raised directly.
raise SystemExit
\ No newline at end of file
...@@ -6,7 +6,7 @@ from django.db.models import Avg, Max, Min, Count, Sum ...@@ -6,7 +6,7 @@ from django.db.models import Avg, Max, Min, Count, Sum
# from node.models import Language, ResourceType, Resource # from node.models import Language, ResourceType, Resource
# from node.models import Node, NodeType, Node_Resource, Project, Corpus # from node.models import Node, NodeType, Node_Resource, Project, Corpus
from sqlalchemy import text from sqlalchemy import text, distinct
from sqlalchemy.sql import func from sqlalchemy.sql import func
from sqlalchemy.orm import aliased from sqlalchemy.orm import aliased
...@@ -18,6 +18,7 @@ Ngram = node.models.Ngram.sa ...@@ -18,6 +18,7 @@ Ngram = node.models.Ngram.sa
Metadata = node.models.Metadata.sa Metadata = node.models.Metadata.sa
Node_Metadata = node.models.Node_Metadata.sa Node_Metadata = node.models.Node_Metadata.sa
# for debugging only
def literalquery(statement, dialect=None): def literalquery(statement, dialect=None):
"""Generate an SQL expression string with bound parameters rendered inline """Generate an SQL expression string with bound parameters rendered inline
for the given SQLAlchemy statement. for the given SQLAlchemy statement.
...@@ -62,6 +63,7 @@ def literalquery(statement, dialect=None): ...@@ -62,6 +63,7 @@ def literalquery(statement, dialect=None):
return LiteralCompiler(dialect, statement) return LiteralCompiler(dialect, statement)
# this one should prove itself useless quite soon
def get_connection(): def get_connection():
import sqlalchemy.orm import sqlalchemy.orm
from django.db import connections from django.db import connections
...@@ -72,9 +74,7 @@ def get_connection(): ...@@ -72,9 +74,7 @@ def get_connection():
engine = get_engine() engine = get_engine()
return engine.connect() return engine.connect()
# for recursive queries
# connection = engine.connect()
# _sql_cte = ''' # _sql_cte = '''
# WITH RECURSIVE cte ("depth", "path", "ordering", "id") AS ( # WITH RECURSIVE cte ("depth", "path", "ordering", "id") AS (
# SELECT 1 AS depth, # SELECT 1 AS depth,
...@@ -103,7 +103,7 @@ import json ...@@ -103,7 +103,7 @@ import json
def JsonHttpResponse(data, status=200): def JsonHttpResponse(data, status=200):
return HttpResponse( return HttpResponse(
content = json.dumps(data, indent=4), content = json.dumps(data, indent=4),
content_type = "application/json", content_type = 'application/json; charset=utf-8',
status = status status = status
) )
Http400 = SuspiciousOperation Http400 = SuspiciousOperation
...@@ -122,9 +122,6 @@ def CsvHttpResponse(data, headers=None, status=200): ...@@ -122,9 +122,6 @@ def CsvHttpResponse(data, headers=None, status=200):
writer.writerow(row) writer.writerow(row)
return response return response
Http400 = SuspiciousOperation
Http403 = PermissionDenied
_ngrams_order_columns = { _ngrams_order_columns = {
"frequency" : "-count", "frequency" : "-count",
"alphabetical" : "terms" "alphabetical" : "terms"
...@@ -223,30 +220,38 @@ class CorpusController: ...@@ -223,30 +220,38 @@ class CorpusController:
.group_by(Metadata) .group_by(Metadata)
) )
# build a collection with the metadata ekys # build a collection with the metadata keys
collection = [] collection = []
for metadata in metadata_query: for metadata in metadata_query:
valuesCount = 0
values = None
# count values # count values
value_column = getattr(Node_Metadata, 'value_' + metadata.type) valuesCount = None
node_metadata_query = (Node_Metadata if metadata.type != 'text':
.query(value_column) value_column = getattr(Node_Metadata, 'value_' + metadata.type)
.join(Node, Node.id == Node_Metadata.node_id) node_metadata_query = (Node_Metadata
.filter(Node.parent_id == node_id) .query(value_column)
.filter(Node_Metadata.metadata_id == metadata.id) .join(Node, Node.id == Node_Metadata.node_id)
.group_by(value_column) .filter(Node.parent_id == node_id)
.order_by(value_column) .filter(Node_Metadata.metadata_id == metadata.id)
) .group_by(value_column)
valuesCount = node_metadata_query.count() .order_by(value_column)
# if there is less than 32 values, retrieve them )
valuesCount = node_metadata_query.count()
# if there is less than 32 values, give them
values = None values = None
if valuesCount <= 32: if isinstance(valuesCount, int) and valuesCount <= 32:
values = [row[0] for row in node_metadata_query.all()] values = [row[0] for row in node_metadata_query.all()]
if metadata.type == 'datetime': if metadata.type == 'datetime':
values = [] values = []
values = map(lambda x: x.isoformat(), values) values = map(lambda x: x.isoformat(), values)
# adding this metadata to the collection
collection.append({ collection.append({
'key': metadata.name, 'key': metadata.name,
'type': metadata.type,
'values': values, 'values': values,
'valuesCount': valuesCount, 'valuesCount': valuesCount,
}) })
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment