Commit 29f9c8a0 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TUTO] Philomemy Notebook created with main functions to explore the subject...

[TUTO] Philomemy Notebook created with main functions to explore the subject in collaboration with David.
parent 11255619
......@@ -23,7 +23,7 @@ from datetime import datetime
def t():
    """Return the current local date and time as a ``YYYY-MM-DD_HH:MM:SS`` string."""
    timestamp_format = "%Y-%m-%d_%H:%M:%S"
    now = datetime.now()
    return now.strftime(timestamp_format)
def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
def compute_occs(corpus, overwrite_id = None, groupings_id = None, year=None, start=None, end=None, interactiv=False):
"""
Calculates sum of occs per ngram (or per mainform if groups) within corpus
(used as info in the ngrams table view)
......@@ -61,6 +61,8 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
.group_by(NodeNgram.ngram_id)
)
if year is not None:
occs_q = occs_q.filter(Node.hyperdata["publication_year"].astext == str(year))
# difficult case: with groups
# ------------
......@@ -108,6 +110,10 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
# for the sum
.group_by("counted_form")
)
if year is not None:
occs_q = occs_q.filter(Node.hyperdata["publication_year"].astext == str(year))
#print(str(occs_q.all()))
occ_sums = occs_q.all()
......@@ -134,13 +140,17 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
# £TODO make it NodeNgram instead NodeNodeNgram ! and rebase :/
# (idem ti_ranking)
bulk_insert(
NodeNodeNgram,
('node1_id' , 'node2_id', 'ngram_id', 'score'),
((the_id, corpus.id, res[0], res[1]) for res in occ_sums)
)
return the_id
if interactiv is False :
bulk_insert(
NodeNodeNgram,
('node1_id' , 'node2_id', 'ngram_id', 'score'),
((the_id, corpus.id, res[0], res[1]) for res in occ_sums)
)
return the_id
else :
return [(res[0], res[1]) for res in occ_sums]
def compute_ti_ranking(corpus,
......
......@@ -20,6 +20,7 @@ def compute_coocs( corpus,
stoplist_id = None,
start = None,
end = None,
year = None,
symmetry_filter = False,
diagonal_filter = True):
"""
......@@ -97,14 +98,21 @@ def compute_coocs( corpus,
WHERE
n.typename = {nodetype_id}
AND n.parent_id = {corpus_id}
""".format( nodetype_id = NODETYPES.index('DOCUMENT')
, corpus_id=corpus.id
)
if year :
cooc_filter_sql += """
AND n.hyperdata -> 'publication_year' = '{year}'
""".format( year=str(year))
cooc_filter_sql += """
GROUP BY 1,2
-- ==
-- GROUP BY ngA, ngB
)
""".format( nodetype_id = NODETYPES.index('DOCUMENT')
, corpus_id=corpus.id
)
"""
# 3) taking the cooccurrences of ngram x2
ngram_filter_A_sql += """
-- STEP 1: X axis of the matrix
......
......@@ -230,6 +230,7 @@ def countCooccurrences( corpus_id=None , cooc_id=None
session.commit()
#data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
#return data
else:
return cooc
return(coocNode.id, cooc)
......@@ -25,7 +25,7 @@ from django.http import Http404
# Import those to be available by notebook user
from langdetect import detect as detect_lang
from gargantext.models import UserNode, User
import functools
# Exception type declared for the notebook helper module. It adds no behavior
# on top of the built-in Exception; where it is raised is not visible in this hunk.
class NotebookError(Exception):
pass
......@@ -40,8 +40,11 @@ def documents(corpus_id):
#import seaborn as sns
import pandas as pd
def countByField(docs, field):
    """Count how many documents share each value of ``hyperdata[field]``.

    Args:
        docs: iterable of objects exposing a ``hyperdata`` mapping.
        field: key looked up in each document's ``hyperdata``.

    Returns:
        A list of ``(value, count)`` pairs, ordered by first appearance
        of each value in ``docs``.

    Raises:
        KeyError: if a document's ``hyperdata`` has no ``field`` entry.
    """
    tally = Counter()
    for doc in docs:
        tally[doc.hyperdata[field]] += 1
    return list(tally.items())
def chart(docs, field):
    """Build a pandas DataFrame of document counts per value of ``field``.

    Args:
        docs: iterable of objects exposing a ``hyperdata`` mapping.
        field: ``hyperdata`` key to count by (e.g. ``"publication_year"``).

    Returns:
        A DataFrame with columns ``Date`` (the field value) and
        ``DateValue`` (the number of documents), indexed by ``Date``.
    """
    # Count once through the shared helper; the inline Counter computation
    # that used to precede this line was immediately overwritten.
    year_publis = countByField(docs, field)
    # frame0 only exists to provide the 'Date' column used as the index below.
    frame0 = pd.DataFrame(year_publis, columns=['Date', 'DateValue'])
    frame1 = pd.DataFrame(year_publis, columns=['Date', 'DateValue'], index=frame0.Date)
    return frame1
......
......@@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {
"collapsed": true,
"deletable": true,
......@@ -26,7 +26,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {
"collapsed": true,
"deletable": true,
......@@ -41,6 +41,1392 @@
"%matplotlib inline "
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"source": [
"# Philomemies"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Instantiate the corpus you are working on"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"My corpus id is : 302695.\n"
]
}
],
"source": [
"corpus_url = \"http://localhost:8000/projects/302694/corpora/302695/\"\n",
"corpus_id = corpus_url.split(\"/\")[6]\n",
"print(\"My corpus id is : %s.\" % corpus_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Getting the Map Terms "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(21, 'environment'), (42, 'development'), (184, 'examples'), (196, 'water'), (368, 'problem'), (576, 'work'), (654, 'technology'), (712, 'number'), (738, 'operation'), (817, 'experiments')]\n"
]
}
],
"source": [
"from gargantext.models import *\n",
"import csv\n",
"\n",
"map_id = session.query(MaplistNode.id).filter(MaplistNode.parent_id == corpus_id).first()\n",
"\n",
"mapTerms = (session.query(Ngram).join( NodeNgram, NodeNgram.ngram_id == Ngram.id)\n",
" .filter(NodeNgram.node_id == map_id)\n",
" .all()\n",
" )\n",
"\n",
"print([(m.id, m.terms) for m in mapTerms[:10]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save in CSV File"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"toPrint = [(m.id,m.terms) for m in mapTerms]\n",
"csvfile = \"./MapTerms.csv\"\n",
"\n",
"#Assuming res is a flat list\n",
"with open(csvfile, \"w\") as output:\n",
" writer = csv.writer(output, lineterminator='\\n')\n",
" for val in toPrint:\n",
" writer.writerow([val])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Next:\n",
"# You can have access to your CSV file in the home of your Notebook!\n",
"# Click, rename, mv, delete in your Notebook\n",
"\n",
"#Assuming output is a list of lists\n",
"#with open(csvfile, \"w\") as output:\n",
"# writer = csv.writer(output, lineterminator='\\n')\n",
"# writer.writerows(res)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Occurrences of MapTerms by Year"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(21370, 4.0),\n",
" (59430, 1.0),\n",
" (651305, 1.0),\n",
" (2360749, 1.0),\n",
" (1449939, 1.0),\n",
" (643027, 1.0),\n",
" (2364476, 1.0),\n",
" (2360737, 1.0),\n",
" (2365155, 1.0),\n",
" (2363638, 1.0),\n",
" (1443297, 2.0),\n",
" (1384982, 1.0),\n",
" (2360189, 1.0),\n",
" (525938, 1.0),\n",
" (2362296, 1.0),\n",
" (1411893, 1.0),\n",
" (2361160, 2.0),\n",
" (2362533, 1.0),\n",
" (499432, 2.0),\n",
" (734655, 1.0),\n",
" (2363202, 1.0),\n",
" (635348, 1.0),\n",
" (2365036, 1.0),\n",
" (2360700, 1.0),\n",
" (2362383, 1.0),\n",
" (567202, 1.0),\n",
" (2177469, 1.0),\n",
" (1422236, 1.0),\n",
" (2361517, 1.0),\n",
" (598620, 2.0),\n",
" (2364289, 1.0),\n",
" (1629967, 1.0),\n",
" (731546, 1.0),\n",
" (680861, 1.0),\n",
" (2363196, 1.0),\n",
" (2358884, 4.0),\n",
" (674406, 2.0),\n",
" (13012, 3.0),\n",
" (2360148, 1.0),\n",
" (622538, 1.0),\n",
" (1585366, 2.0),\n",
" (647149, 1.0),\n",
" (2358559, 1.0),\n",
" (2365513, 2.0),\n",
" (512496, 3.0),\n",
" (2365043, 1.0),\n",
" (2359304, 1.0),\n",
" (2362593, 1.0),\n",
" (513711, 3.0),\n",
" (492233, 1.0),\n",
" (64331, 3.0),\n",
" (2365676, 1.0),\n",
" (2360498, 1.0),\n",
" (445661, 1.0),\n",
" (2364442, 1.0),\n",
" (2362607, 1.0),\n",
" (806318, 1.0),\n",
" (1895463, 1.0),\n",
" (2359089, 1.0),\n",
" (2361007, 1.0),\n",
" (157477, 1.0),\n",
" (2364599, 1.0),\n",
" (2364315, 2.0),\n",
" (2360787, 2.0),\n",
" (57317, 3.0),\n",
" (514091, 8.0),\n",
" (31997, 1.0),\n",
" (2362566, 1.0),\n",
" (685604, 1.0),\n",
" (2170353, 1.0),\n",
" (501580, 1.0),\n",
" (2360868, 1.0),\n",
" (26, 25.0),\n",
" (2358702, 4.0),\n",
" (620585, 1.0),\n",
" (742237, 3.0),\n",
" (22883, 2.0),\n",
" (2362338, 1.0),\n",
" (1688637, 1.0),\n",
" (2362267, 1.0),\n",
" (2364837, 1.0),\n",
" (2360978, 1.0),\n",
" (501714, 1.0),\n",
" (2364106, 1.0),\n",
" (486294, 2.0),\n",
" (2120016, 1.0),\n",
" (2365812, 1.0),\n",
" (491783, 1.0),\n",
" (2362771, 1.0),\n",
" (2363111, 1.0),\n",
" (1433227, 1.0),\n",
" (2362738, 2.0),\n",
" (2360235, 1.0),\n",
" (2359671, 1.0),\n",
" (929961, 1.0),\n",
" (2360102, 1.0),\n",
" (2360196, 1.0),\n",
" (622587, 1.0),\n",
" (2365674, 1.0),\n",
" (2359850, 1.0),\n",
" (2362364, 1.0),\n",
" (2365945, 1.0),\n",
" (2360968, 1.0),\n",
" (2364469, 1.0),\n",
" (818337, 1.0),\n",
" (2364330, 1.0),\n",
" (851756, 1.0),\n",
" (481888, 2.0),\n",
" (2363028, 1.0),\n",
" (2362572, 1.0),\n",
" (619602, 1.0),\n",
" (480556, 2.0),\n",
" (617985, 1.0),\n",
" (2359001, 1.0),\n",
" (1530066, 3.0),\n",
" (2361189, 1.0),\n",
" (2365119, 1.0),\n",
" (2359622, 1.0),\n",
" (2358673, 1.0),\n",
" (1770959, 1.0),\n",
" (2359417, 1.0),\n",
" (2364799, 1.0),\n",
" (1625445, 1.0),\n",
" (2359191, 1.0),\n",
" (1637994, 1.0),\n",
" (2364004, 1.0),\n",
" (2365841, 1.0),\n",
" (2361921, 1.0),\n",
" (2363485, 1.0),\n",
" (2364956, 1.0),\n",
" (2363993, 1.0),\n",
" (703437, 1.0),\n",
" (2365657, 1.0),\n",
" (477579, 4.0),\n",
" (2364405, 1.0),\n",
" (931092, 1.0),\n",
" (16034, 1.0),\n",
" (55673, 3.0),\n",
" (83733, 1.0),\n",
" (632306, 8.0),\n",
" (2365015, 1.0),\n",
" (1380705, 1.0),\n",
" (2364241, 1.0),\n",
" (2361341, 1.0),\n",
" (2365226, 1.0),\n",
" (2360270, 3.0),\n",
" (2359257, 1.0),\n",
" (513664, 2.0),\n",
" (2363752, 1.0),\n",
" (2358578, 5.0),\n",
" (462354, 1.0),\n",
" (2364333, 2.0),\n",
" (2365625, 1.0),\n",
" (2136540, 1.0),\n",
" (438777, 1.0),\n",
" (1395914, 1.0),\n",
" (509545, 1.0),\n",
" (2360917, 1.0),\n",
" (2364219, 1.0),\n",
" (2361672, 1.0),\n",
" (919892, 1.0),\n",
" (2361169, 1.0),\n",
" (2363689, 1.0),\n",
" (631491, 3.0),\n",
" (1608035, 1.0),\n",
" (2363660, 2.0),\n",
" (2363106, 1.0),\n",
" (1324144, 1.0),\n",
" (5561, 14.0),\n",
" (2361420, 1.0),\n",
" (2364011, 1.0),\n",
" (1438416, 2.0),\n",
" (629048, 2.0),\n",
" (586132, 1.0),\n",
" (690740, 1.0),\n",
" (494644, 6.0),\n",
" (2359973, 1.0),\n",
" (2364755, 1.0),\n",
" (673739, 1.0),\n",
" (296, 1.0),\n",
" (926220, 2.0),\n",
" (807705, 1.0),\n",
" (528702, 1.0),\n",
" (16802, 4.0),\n",
" (2360888, 1.0),\n",
" (568435, 1.0),\n",
" (2359540, 1.0),\n",
" (1387358, 3.0),\n",
" (2359551, 3.0),\n",
" (497582, 43.0),\n",
" (1677149, 1.0),\n",
" (1355614, 4.0),\n",
" (1201745, 1.0),\n",
" (505837, 1.0),\n",
" (559722, 1.0),\n",
" (2365008, 2.0),\n",
" (2365846, 1.0),\n",
" (2360910, 1.0),\n",
" (1516185, 1.0),\n",
" (2365055, 1.0),\n",
" (2360713, 1.0),\n",
" (2363076, 1.0),\n",
" (2363231, 1.0),\n",
" (2361690, 1.0),\n",
" (1514633, 1.0),\n",
" (2361995, 1.0),\n",
" (2363636, 1.0),\n",
" (2363301, 1.0),\n",
" (440819, 1.0),\n",
" (2365719, 2.0),\n",
" (2362375, 1.0),\n",
" (735539, 1.0),\n",
" (2361324, 1.0),\n",
" (120180, 1.0),\n",
" (2170107, 1.0),\n",
" (2363634, 1.0),\n",
" (2362961, 1.0),\n",
" (2364791, 1.0),\n",
" (2360526, 1.0),\n",
" (1921124, 1.0),\n",
" (2364312, 2.0),\n",
" (2359118, 3.0),\n",
" (63107, 2.0),\n",
" (2361984, 2.0),\n",
" (499205, 1.0),\n",
" (8604, 13.0),\n",
" (2362915, 1.0),\n",
" (2363378, 1.0),\n",
" (720125, 3.0),\n",
" (302111, 1.0),\n",
" (655753, 1.0),\n",
" (735895, 2.0),\n",
" (2365447, 2.0),\n",
" (2360850, 1.0),\n",
" (689048, 2.0),\n",
" (445111, 1.0),\n",
" (503269, 2.0),\n",
" (2359395, 2.0),\n",
" (1405763, 1.0),\n",
" (829454, 1.0),\n",
" (2365278, 1.0),\n",
" (2362406, 1.0),\n",
" (2362394, 1.0),\n",
" (494627, 1.0),\n",
" (2362131, 1.0),\n",
" (2362087, 1.0),\n",
" (1353261, 2.0),\n",
" (2361179, 1.0),\n",
" (2362444, 2.0),\n",
" (2360429, 1.0),\n",
" (2362294, 1.0),\n",
" (469284, 2.0),\n",
" (1893049, 1.0),\n",
" (2365809, 3.0),\n",
" (2359723, 2.0),\n",
" (2363078, 1.0),\n",
" (2360239, 1.0),\n",
" (2362494, 1.0),\n",
" (1877521, 1.0),\n",
" (2360110, 4.0),\n",
" (2363186, 1.0),\n",
" (884258, 1.0),\n",
" (2359352, 1.0),\n",
" (2522, 3.0),\n",
" (2362417, 1.0),\n",
" (450837, 1.0),\n",
" (2364726, 1.0),\n",
" (2363699, 1.0),\n",
" (2364702, 1.0),\n",
" (2359174, 1.0),\n",
" (1963, 1.0),\n",
" (559468, 1.0),\n",
" (6118, 9.0),\n",
" (2359177, 2.0),\n",
" (2362514, 1.0),\n",
" (2362221, 1.0),\n",
" (2365090, 1.0),\n",
" (2365503, 1.0),\n",
" (527113, 1.0),\n",
" (2362930, 1.0),\n",
" (2362782, 1.0),\n",
" (2365635, 1.0),\n",
" (54751, 1.0),\n",
" (513650, 1.0),\n",
" (2362227, 2.0),\n",
" (608048, 1.0),\n",
" (2360822, 1.0),\n",
" (2365091, 1.0),\n",
" (2364883, 1.0),\n",
" (2362610, 1.0),\n",
" (620473, 9.0),\n",
" (1411038, 2.0),\n",
" (29247, 12.0),\n",
" (624176, 4.0),\n",
" (2364503, 1.0),\n",
" (7150, 1.0),\n",
" (2358794, 2.0),\n",
" (2361782, 1.0),\n",
" (2362586, 1.0),\n",
" (2360037, 1.0),\n",
" (1429116, 1.0),\n",
" (2359620, 1.0),\n",
" (923, 4.0),\n",
" (2361933, 1.0),\n",
" (2360660, 1.0),\n",
" (2365277, 2.0),\n",
" (2191553, 1.0),\n",
" (2364895, 2.0),\n",
" (2364275, 2.0),\n",
" (2361536, 1.0),\n",
" (2365404, 1.0),\n",
" (2359764, 1.0),\n",
" (1561871, 2.0),\n",
" (559320, 1.0),\n",
" (873327, 1.0),\n",
" (658039, 1.0),\n",
" (2359213, 2.0),\n",
" (2359535, 1.0),\n",
" (2361736, 1.0),\n",
" (2364559, 1.0),\n",
" (1623384, 1.0),\n",
" (30980, 1.0),\n",
" (750366, 1.0),\n",
" (20356, 1.0),\n",
" (2365921, 1.0),\n",
" (2152944, 1.0),\n",
" (587010, 1.0),\n",
" (849909, 11.0),\n",
" (14527, 1.0),\n",
" (8011, 35.0),\n",
" (2361030, 1.0),\n",
" (1545504, 1.0),\n",
" (2361015, 1.0),\n",
" (2365040, 1.0),\n",
" (1447721, 9.0),\n",
" (2362086, 1.0),\n",
" (2362995, 1.0),\n",
" (63843, 1.0),\n",
" (2365793, 1.0),\n",
" (21, 7.0),\n",
" (545578, 1.0),\n",
" (2362704, 1.0),\n",
" (2360704, 1.0),\n",
" (10704, 3.0),\n",
" (3942, 4.0),\n",
" (5270, 6.0),\n",
" (2361778, 1.0),\n",
" (2363553, 1.0),\n",
" (2364310, 1.0),\n",
" (1301103, 1.0),\n",
" (444719, 3.0),\n",
" (2359886, 1.0),\n",
" (2362677, 1.0),\n",
" (2359658, 1.0),\n",
" (2358746, 10.0),\n",
" (21645, 2.0),\n",
" (2360518, 1.0),\n",
" (2364300, 1.0),\n",
" (1387595, 1.0),\n",
" (2362101, 1.0),\n",
" (2364435, 1.0),\n",
" (2365058, 2.0),\n",
" (2359112, 1.0),\n",
" (2360899, 1.0),\n",
" (2362248, 1.0),\n",
" (854727, 1.0),\n",
" (1423016, 1.0),\n",
" (1413873, 1.0),\n",
" (2363707, 1.0),\n",
" (2363157, 1.0),\n",
" (2153, 7.0),\n",
" (934934, 1.0),\n",
" (616231, 1.0),\n",
" (511566, 1.0),\n",
" (2364500, 1.0),\n",
" (2361001, 1.0),\n",
" (1397541, 1.0),\n",
" (587884, 2.0),\n",
" (2365532, 1.0),\n",
" (8410, 1.0),\n",
" (827517, 1.0),\n",
" (19604, 1.0),\n",
" (2359015, 1.0),\n",
" (2359056, 1.0),\n",
" (2362183, 1.0),\n",
" (2365154, 1.0),\n",
" (2360190, 1.0),\n",
" (2358618, 1.0),\n",
" (463674, 38.0),\n",
" (1703021, 1.0),\n",
" (850864, 1.0),\n",
" (2361383, 1.0),\n",
" (2363051, 1.0),\n",
" (515051, 1.0),\n",
" (506340, 1.0),\n",
" (147281, 3.0),\n",
" (2359145, 6.0),\n",
" (2361831, 1.0),\n",
" (1307142, 1.0),\n",
" (2362005, 1.0),\n",
" (2362907, 1.0),\n",
" (2363086, 1.0),\n",
" (1780184, 1.0),\n",
" (2359967, 3.0),\n",
" (2341709, 1.0),\n",
" (2361449, 1.0),\n",
" (2195068, 1.0),\n",
" (1550538, 1.0),\n",
" (2359930, 1.0),\n",
" (2358556, 1.0),\n",
" (2359028, 1.0),\n",
" (2362000, 1.0),\n",
" (477488, 1.0),\n",
" (1325934, 1.0),\n",
" (2358872, 1.0),\n",
" (532439, 1.0),\n",
" (2359331, 1.0),\n",
" (2359288, 1.0),\n",
" (526473, 1.0),\n",
" (786352, 1.0),\n",
" (2362121, 1.0),\n",
" (29473, 2.0),\n",
" (2363837, 1.0),\n",
" (2364991, 1.0),\n",
" (2364888, 1.0),\n",
" (902377, 1.0),\n",
" (2363525, 1.0),\n",
" (2364401, 1.0),\n",
" (2365986, 1.0),\n",
" (2361401, 1.0),\n",
" (2365266, 1.0),\n",
" (1713272, 1.0),\n",
" (2359931, 1.0),\n",
" (506213, 1.0),\n",
" (2361843, 1.0),\n",
" (1694972, 1.0),\n",
" (590807, 1.0),\n",
" (2363469, 1.0),\n",
" (510679, 1.0),\n",
" (794150, 1.0),\n",
" (519092, 2.0),\n",
" (1733, 18.0),\n",
" (3061, 2.0),\n",
" (1585972, 1.0),\n",
" (742843, 1.0),\n",
" (520505, 1.0),\n",
" (2360506, 1.0),\n",
" (2364047, 1.0),\n",
" (2363234, 1.0),\n",
" (987, 5.0),\n",
" (509404, 1.0),\n",
" (1522832, 2.0),\n",
" (2359095, 1.0),\n",
" (1436961, 2.0),\n",
" (1201089, 2.0),\n",
" (2361240, 1.0),\n",
" (2362356, 1.0),\n",
" (2365630, 1.0),\n",
" (1602420, 1.0),\n",
" (2362337, 1.0),\n",
" (2364139, 1.0),\n",
" (2362046, 1.0),\n",
" (504418, 1.0),\n",
" (2152668, 1.0),\n",
" (2362102, 2.0),\n",
" (8096, 2.0),\n",
" (228091, 1.0),\n",
" (2365067, 1.0),\n",
" (2362173, 1.0),\n",
" (1521046, 3.0),\n",
" (2361475, 1.0),\n",
" (13387, 1.0),\n",
" (2364137, 1.0),\n",
" (2359308, 1.0),\n",
" (2360943, 1.0),\n",
" (1658105, 4.0),\n",
" (494569, 1.0),\n",
" (94, 1.0),\n",
" (55639, 2.0),\n",
" (2777, 2.0),\n",
" (418077, 1.0),\n",
" (62608, 1.0),\n",
" (2361594, 1.0),\n",
" (2358806, 1.0),\n",
" (482756, 1.0),\n",
" (2361127, 1.0),\n",
" (2364255, 1.0),\n",
" (2329826, 2.0),\n",
" (2361084, 2.0),\n",
" (2360560, 1.0),\n",
" (623059, 1.0),\n",
" (2445, 3.0),\n",
" (81429, 2.0),\n",
" (1179801, 1.0),\n",
" (2362862, 1.0),\n",
" (2361703, 1.0),\n",
" (2359312, 1.0),\n",
" (9826, 2.0),\n",
" (2364379, 1.0),\n",
" (527741, 1.0),\n",
" (2364189, 1.0),\n",
" (2359316, 2.0),\n",
" (584752, 9.0),\n",
" (1641794, 1.0),\n",
" (2365861, 1.0),\n",
" (1208011, 1.0),\n",
" (20970, 1.0),\n",
" (1937, 5.0),\n",
" (5359, 4.0),\n",
" (1752091, 1.0),\n",
" (1375448, 1.0),\n",
" (595143, 1.0),\n",
" (2364461, 1.0),\n",
" (5682, 1.0),\n",
" (2362063, 1.0),\n",
" (21879, 1.0),\n",
" (2360701, 1.0),\n",
" (2358977, 1.0),\n",
" (2361154, 1.0),\n",
" (2362340, 1.0),\n",
" (1785700, 1.0),\n",
" (2362842, 1.0),\n",
" (2359448, 1.0),\n",
" (457564, 1.0),\n",
" (8397, 13.0),\n",
" (2361431, 1.0),\n",
" (2365743, 1.0),\n",
" (1589760, 3.0),\n",
" (535634, 1.0),\n",
" (442566, 1.0),\n",
" (542422, 2.0),\n",
" (2362697, 1.0),\n",
" (439327, 4.0),\n",
" (1479888, 1.0),\n",
" (2363995, 1.0),\n",
" (2035, 8.0),\n",
" (20992, 2.0),\n",
" (2362680, 1.0),\n",
" (2362363, 1.0),\n",
" (2360139, 1.0),\n",
" (1767285, 1.0),\n",
" (676959, 5.0),\n",
" (2359645, 1.0),\n",
" (595179, 1.0),\n",
" (10269, 1.0),\n",
" (2359685, 1.0),\n",
" (2361384, 1.0),\n",
" (2364845, 1.0),\n",
" (2359606, 1.0),\n",
" (913230, 2.0),\n",
" (2361786, 1.0),\n",
" (2364482, 1.0),\n",
" (2358728, 1.0),\n",
" (1780966, 1.0),\n",
" (2358622, 1.0),\n",
" (2359594, 1.0),\n",
" (2360310, 1.0),\n",
" (455269, 1.0),\n",
" (2361842, 1.0),\n",
" (2358852, 1.0),\n",
" (2361900, 1.0),\n",
" (2358908, 1.0),\n",
" (2365963, 1.0),\n",
" (2359772, 1.0),\n",
" (2360319, 1.0),\n",
" (1317685, 1.0),\n",
" (2361684, 3.0),\n",
" (2363498, 1.0),\n",
" (2359707, 1.0),\n",
" (2364188, 1.0),\n",
" (2143737, 2.0),\n",
" (2362457, 1.0),\n",
" (512968, 1.0),\n",
" (2880, 2.0),\n",
" (2360412, 1.0),\n",
" (2361277, 1.0),\n",
" (1390970, 1.0),\n",
" (2365974, 1.0),\n",
" (2361896, 1.0),\n",
" (725235, 1.0),\n",
" (2362316, 1.0),\n",
" (2364158, 1.0),\n",
" (2365037, 1.0),\n",
" (502824, 1.0),\n",
" (2363295, 2.0),\n",
" (2363599, 1.0),\n",
" (2364585, 1.0),\n",
" (2365786, 1.0),\n",
" (536579, 2.0),\n",
" (2359141, 2.0),\n",
" (2359301, 1.0),\n",
" (2365386, 1.0),\n",
" (3009, 3.0),\n",
" (2364890, 1.0),\n",
" (59339, 1.0),\n",
" (2362906, 1.0),\n",
" (2119440, 1.0),\n",
" (2361640, 1.0),\n",
" (2364210, 1.0),\n",
" (2359236, 1.0),\n",
" (493981, 1.0),\n",
" (622177, 1.0),\n",
" (2365989, 1.0),\n",
" (1456511, 3.0),\n",
" (112504, 1.0),\n",
" (2363967, 2.0),\n",
" (2363633, 1.0),\n",
" (1513182, 1.0),\n",
" (2365117, 1.0),\n",
" (5332, 6.0),\n",
" (2360334, 1.0),\n",
" (2360666, 1.0),\n",
" (1642133, 4.0),\n",
" (2363528, 1.0),\n",
" (830264, 1.0),\n",
" (1509930, 1.0),\n",
" (7608, 1.0),\n",
" (2363558, 1.0),\n",
" (1435699, 1.0),\n",
" (2360637, 1.0),\n",
" (2360856, 1.0),\n",
" (2359505, 1.0),\n",
" (2363393, 1.0),\n",
" (3599, 1.0),\n",
" (11037, 1.0),\n",
" (578835, 1.0),\n",
" (2362787, 1.0),\n",
" (2363423, 1.0),\n",
" (2359353, 1.0),\n",
" (2362875, 1.0),\n",
" (2359700, 1.0),\n",
" (2165377, 1.0),\n",
" (2361553, 1.0),\n",
" (2363307, 5.0),\n",
" (2365987, 1.0),\n",
" (850295, 1.0),\n",
" (2365369, 1.0),\n",
" (2363897, 1.0),\n",
" (4825, 1.0),\n",
" (2251432, 1.0),\n",
" (456369, 1.0),\n",
" (2359058, 6.0),\n",
" (912625, 1.0),\n",
" (2359848, 1.0),\n",
" (2360533, 1.0),\n",
" (2156267, 1.0),\n",
" (2364731, 1.0),\n",
" (1416113, 1.0),\n",
" (2365228, 1.0),\n",
" (2361806, 1.0),\n",
" (2363276, 2.0),\n",
" (2364251, 1.0),\n",
" (2364515, 1.0),\n",
" (2359615, 2.0),\n",
" (2361776, 1.0),\n",
" (182859, 1.0),\n",
" (2363194, 1.0),\n",
" (2365020, 1.0),\n",
" (2364838, 1.0),\n",
" (2365848, 1.0),\n",
" (1641124, 1.0),\n",
" (2365690, 2.0),\n",
" (534591, 1.0),\n",
" (72938, 29.0),\n",
" (661363, 1.0),\n",
" (8973, 4.0),\n",
" (311226, 1.0),\n",
" (2359475, 1.0),\n",
" (829015, 1.0),\n",
" (2361777, 1.0),\n",
" (615301, 1.0),\n",
" (2362397, 1.0),\n",
" (509336, 1.0),\n",
" (603785, 1.0),\n",
" (610033, 1.0),\n",
" (2362519, 1.0),\n",
" (2360994, 1.0),\n",
" (1500460, 1.0),\n",
" (1587560, 2.0),\n",
" (2362004, 1.0),\n",
" (2365875, 1.0),\n",
" (2362539, 2.0),\n",
" (2363704, 1.0),\n",
" (2364974, 1.0),\n",
" (2361217, 1.0),\n",
" (2361682, 1.0),\n",
" (62444, 1.0),\n",
" (2360507, 1.0),\n",
" (2360515, 1.0),\n",
" (1891144, 1.0),\n",
" (2361650, 1.0),\n",
" (2363585, 1.0),\n",
" (8861, 1.0),\n",
" (669920, 2.0),\n",
" (2364078, 1.0),\n",
" (2363179, 1.0),\n",
" (2364103, 1.0),\n",
" (2360001, 2.0),\n",
" (1553516, 2.0),\n",
" (13863, 3.0),\n",
" (606638, 1.0),\n",
" (7123, 2.0),\n",
" (2360375, 1.0),\n",
" (846902, 2.0),\n",
" (1426631, 2.0),\n",
" (2364606, 1.0),\n",
" (56567, 2.0),\n",
" (2362827, 1.0),\n",
" (3774, 2.0),\n",
" (1640013, 1.0),\n",
" (2362743, 1.0),\n",
" (1373633, 1.0),\n",
" (2359834, 2.0),\n",
" (507624, 3.0),\n",
" (221550, 1.0),\n",
" (603246, 1.0),\n",
" (495367, 1.0),\n",
" (2361515, 2.0),\n",
" (2359822, 1.0),\n",
" (1737286, 3.0),\n",
" (2364808, 1.0),\n",
" (2365725, 1.0),\n",
" (2361772, 1.0),\n",
" (1651902, 1.0),\n",
" (2363306, 1.0),\n",
" (619, 1.0),\n",
" (1629163, 1.0),\n",
" (1504097, 1.0),\n",
" (2362986, 1.0),\n",
" (2364864, 1.0),\n",
" (2360673, 1.0),\n",
" (2362113, 1.0),\n",
" (2359830, 1.0),\n",
" (2361568, 1.0),\n",
" (2364434, 1.0),\n",
" (1458249, 7.0),\n",
" (2360311, 1.0),\n",
" (529246, 1.0),\n",
" (1488668, 1.0),\n",
" (2363642, 1.0),\n",
" (2360653, 1.0),\n",
" (1559068, 1.0),\n",
" (2365321, 1.0),\n",
" (1457684, 1.0),\n",
" (438646, 1.0),\n",
" (2365810, 1.0),\n",
" (2365732, 1.0),\n",
" (1412614, 1.0),\n",
" (2359828, 1.0),\n",
" (2361086, 1.0),\n",
" (481165, 1.0),\n",
" (1415000, 4.0),\n",
" (2361620, 1.0),\n",
" (1519582, 1.0),\n",
" (495913, 1.0),\n",
" (571277, 1.0),\n",
" (929616, 1.0),\n",
" (1496975, 1.0),\n",
" (2364259, 1.0),\n",
" (720411, 1.0),\n",
" (590431, 1.0),\n",
" (2360442, 1.0),\n",
" (10332, 16.0),\n",
" (229, 6.0),\n",
" (2364741, 1.0),\n",
" (2362709, 1.0),\n",
" (2364303, 1.0),\n",
" (849430, 1.0),\n",
" (2282498, 1.0),\n",
" (2359863, 1.0),\n",
" (2364492, 1.0),\n",
" (2362132, 1.0),\n",
" (2361029, 1.0),\n",
" (2360359, 1.0),\n",
" (2365821, 1.0),\n",
" (2361837, 1.0),\n",
" (2364649, 1.0),\n",
" (477731, 1.0),\n",
" (2365708, 1.0),\n",
" (520153, 1.0),\n",
" (721226, 1.0),\n",
" (1507049, 1.0),\n",
" (2359250, 1.0),\n",
" (1444, 2.0),\n",
" (2359380, 2.0),\n",
" (2358611, 1.0),\n",
" (2365631, 1.0),\n",
" (2358674, 1.0),\n",
" (498799, 2.0),\n",
" (518187, 2.0),\n",
" (1882294, 1.0),\n",
" (2364641, 1.0),\n",
" (2364180, 1.0),\n",
" (2358754, 1.0),\n",
" (22225, 1.0),\n",
" (1605044, 1.0),\n",
" (2365651, 1.0),\n",
" (1778186, 2.0),\n",
" (561922, 2.0),\n",
" (17401, 5.0),\n",
" (136897, 1.0),\n",
" (2365808, 1.0),\n",
" (2360158, 2.0),\n",
" (2361616, 1.0),\n",
" (2362954, 1.0),\n",
" (2364321, 1.0),\n",
" (2362764, 1.0),\n",
" (2361022, 1.0),\n",
" (2361951, 1.0),\n",
" (582950, 1.0),\n",
" (589092, 5.0),\n",
" (2362133, 1.0),\n",
" (2363691, 2.0),\n",
" (2364517, 1.0),\n",
" (60812, 4.0),\n",
" (2360940, 1.0),\n",
" (7581, 1.0),\n",
" (2364208, 1.0),\n",
" (2363030, 2.0),\n",
" (2360667, 1.0),\n",
" (16074, 3.0),\n",
" (2359460, 2.0),\n",
" (1212403, 1.0),\n",
" (2361133, 1.0),\n",
" (1307614, 1.0),\n",
" (2363300, 1.0),\n",
" (676195, 2.0),\n",
" (1386896, 1.0),\n",
" (2362905, 1.0),\n",
" (460493, 1.0),\n",
" (1754392, 1.0),\n",
" (2365403, 2.0),\n",
" (2361743, 1.0),\n",
" (1536985, 1.0),\n",
" (2359239, 1.0),\n",
" (2362454, 1.0),\n",
" (2364031, 1.0),\n",
" (2364967, 1.0),\n",
" (2363483, 1.0),\n",
" (531152, 1.0),\n",
" (628079, 1.0),\n",
" (2364775, 1.0),\n",
" (2360912, 1.0),\n",
" (2362164, 1.0),\n",
" (2361361, 1.0),\n",
" (2364337, 1.0),\n",
" (2360479, 1.0),\n",
" (1636750, 1.0),\n",
" (2362756, 1.0),\n",
" (6776, 28.0),\n",
" (2359728, 1.0),\n",
" (1509353, 1.0),\n",
" (2363718, 2.0),\n",
" (2360247, 1.0),\n",
" (14320, 3.0),\n",
" (2362270, 1.0),\n",
" (2358695, 2.0),\n",
" (2364486, 1.0),\n",
" (622987, 1.0),\n",
" (2359037, 3.0),\n",
" (2365803, 2.0),\n",
" (2360945, 1.0),\n",
" (670095, 2.0),\n",
" (1868827, 1.0),\n",
" (854430, 1.0),\n",
" (886740, 1.0),\n",
" (2363007, 1.0),\n",
" (2365356, 1.0),\n",
" (2361581, 1.0),\n",
" (1891808, 1.0),\n",
" (2364560, 1.0),\n",
" (2358659, 2.0),\n",
" (2361679, 1.0),\n",
" (1399977, 1.0),\n",
" (2362470, 1.0),\n",
" (2362535, 1.0),\n",
" (7889, 3.0),\n",
" (2360679, 1.0),\n",
" (509662, 2.0),\n",
" (2362156, 1.0),\n",
" (2364667, 1.0),\n",
" (2362033, 1.0),\n",
" (2362283, 1.0),\n",
" (2364063, 1.0),\n",
" (2361375, 1.0),\n",
" (1475626, 1.0),\n",
" (1521047, 2.0),\n",
" (511427, 1.0),\n",
" (111326, 1.0),\n",
" (2360707, 1.0),\n",
" (505637, 2.0),\n",
" (1488943, 3.0),\n",
" (2359877, 1.0),\n",
" (2360900, 1.0),\n",
" (18972, 1.0),\n",
" (16443, 13.0),\n",
" (2363138, 1.0),\n",
" (2365566, 1.0),\n",
" (2362384, 1.0),\n",
" (2360423, 1.0),\n",
" (1493576, 2.0),\n",
" (514804, 3.0),\n",
" (2364588, 1.0),\n",
" (2363799, 1.0),\n",
" (921794, 1.0),\n",
" (453389, 1.0),\n",
" (60324, 2.0),\n",
" (2358664, 2.0),\n",
" (2365322, 1.0),\n",
" (2364109, 1.0),\n",
" (2361234, 4.0),\n",
" (4098, 1.0),\n",
" (2362380, 1.0),\n",
" (20, 38.0),\n",
" (16087, 8.0),\n",
" (1424352, 1.0),\n",
" (1651793, 4.0),\n",
" (483093, 1.0),\n",
" (497620, 1.0),\n",
" (545734, 1.0),\n",
" (44231, 1.0),\n",
" (2364363, 1.0),\n",
" (2360246, 2.0),\n",
" (2362424, 1.0),\n",
" (587390, 1.0),\n",
" (2363417, 1.0),\n",
" (2362446, 1.0),\n",
" (2364057, 1.0),\n",
" (1533417, 1.0),\n",
" (2362937, 1.0),\n",
" (514879, 9.0),\n",
" (851674, 1.0),\n",
" (2362473, 1.0),\n",
" (116746, 1.0),\n",
" (2358853, 1.0),\n",
" (495626, 1.0),\n",
" (4606, 9.0),\n",
" (504650, 3.0),\n",
" (2358915, 1.0),\n",
" (2361056, 1.0),\n",
" (2414, 1.0),\n",
" (2359008, 1.0),\n",
" (920101, 1.0),\n",
" (1468842, 1.0),\n",
" (2363359, 1.0),\n",
" (507170, 31.0),\n",
" (2358637, 1.0),\n",
" (84521, 1.0),\n",
" (2359052, 1.0),\n",
" (2360379, 1.0),\n",
" (2363611, 1.0),\n",
" (1919361, 2.0),\n",
" (732322, 1.0),\n",
" (501422, 1.0),\n",
" (2365077, 2.0),\n",
" (2360409, 1.0),\n",
" (2362312, 1.0),\n",
" (1434485, 1.0),\n",
" (522828, 1.0),\n",
" (2364297, 1.0),\n",
" (1397378, 1.0),\n",
" (2364626, 1.0),\n",
" (2363767, 1.0),\n",
" (459137, 7.0),\n",
" (1523910, 1.0),\n",
" (1623256, 1.0),\n",
" (2365326, 1.0),\n",
" (2360470, 1.0),\n",
" (2363199, 1.0),\n",
" (2363020, 1.0),\n",
" (2365890, 1.0),\n",
" (2363754, 1.0),\n",
" (2365492, 1.0),\n",
" (916266, 1.0),\n",
" (239869, 5.0),\n",
" (2363263, 1.0),\n",
" (14041, 1.0),\n",
" (2359252, 1.0),\n",
" (2362123, 1.0),\n",
" (2358623, 1.0),\n",
" (1400803, 1.0),\n",
" (2363090, 1.0),\n",
" (2363484, 1.0),\n",
" (589395, 1.0),\n",
" (2362003, 1.0),\n",
" (2359407, 1.0),\n",
" (1406864, 1.0),\n",
" (2362701, 1.0),\n",
" (882, 1.0),\n",
" (2362047, 1.0),\n",
" (2365866, 1.0),\n",
" (2365640, 1.0),\n",
" (2365581, 1.0),\n",
" (616044, 2.0),\n",
" (791952, 1.0),\n",
" (2364175, 1.0),\n",
" (692211, 2.0),\n",
" (12131, 2.0),\n",
" (2359642, 3.0),\n",
" (1414828, 1.0),\n",
" ...]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from gargantext.util.toolchain.metric_tfidf import compute_occs\n",
"\n",
"corpus= session.query(CorpusNode).get(corpus_id)\n",
"\n",
"occ_id = session.query(OccurrencesNode.id).filter(OccurrencesNode.parent_id == corpus_id).first()\n",
"group_id = session.query(GrouplistNode.id).filter(GrouplistNode.parent_id == corpus_id).first()\n",
"Occurrences = aliased(NodeNodeNgram)\n",
"MapTerms = aliased(NodeNgram)\n",
"Documents = aliased(DocumentNode)\n",
"\n",
"compute_occs(corpus, interactiv=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"mapTermsOcc = (session.query(Occurrences).join( MapTerms, MapTerms.ngram_id == Occurrences.ngram_id)\n",
" .filter(MapTerms.node_id == map_id)\n",
" \n",
" .join(Documents, Documents.id == Occurrences.node2_id)\n",
" .filter(Documents.parent_id == corpus_id)\n",
" \n",
" .filter(Occurrences.node1_id == occ_id)\n",
" \n",
" #.group_by(Occurrences.ngram_id)\n",
" .all()\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(303698)"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"group_id"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mapTermsOcc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cooccurrences of MapTerms by Year"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from graph.cooccurrences import countCooccurrences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
" (cooc_id, cooc_matrix) = countCooccurrences( corpus_id=corpus_id, cooc_id= \n",
" , field1=field1, field2=field2 \n",
" , start=start , end =end \n",
" , mapList_id=mapList_id , groupList_id=groupList_id \n",
" , isMonopartite=True , threshold = threshold \n",
" , distance=distance , bridgeness=bridgeness \n",
" , save_on_db = True , reset = reset \n",
" ) "
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GRAPH #303869 Filtering the matrix with Map and Group Lists.\n",
"WeightedMatrix bulk_insert start\n",
"WeightedMatrix bulk_insert stop\n",
"GRAPH #303869 ... Node Cooccurrence Matrix saved\n",
"GRAPH #303869 ... Parameters saved in Node.\n"
]
}
],
"source": [
"#countCooccurrences(corpus_id, save_on_db=False, start=\"2000-01-01\", end=\"2017-12-31\")\n",
"(cooc_id, cooc_matrix) = countCooccurrences( corpus_id = corpus_id\n",
" , cooc_id = None\n",
" , field1=\"ngrams\", field2 = \"ngrams\"\n",
" \n",
" , mapList_id = map_id\n",
" , groupList_id = group_id\n",
" \n",
" , isMonopartite =True , threshold = 2 \n",
" #, distance =Non , bridgeness=bridgeness\n",
" \n",
" , save_on_db = True\n",
" , reset = True\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(float, {})"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cooc_matrix.items"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Number of Documents per year"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Date DateValue\n",
"Date \n",
"1954 1954 2\n",
"1956 1956 1\n",
"1957 1957 1\n",
"1958 1958 5\n",
"1960 1960 3\n",
"1961 1961 5\n",
"1962 1962 2\n",
"1963 1963 11\n",
"1964 1964 5\n",
"1965 1965 3\n"
]
}
],
"source": [
"# To get all the documents:\n",
"docs = documents(corpus_id)\n",
"# If I want to count:\n",
"myChart = chart(docs, \"publication_year\")\n",
"print(myChart[:10])"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"source": [
"# Others example"
]
},
{
"cell_type": "code",
"execution_count": 3,
......@@ -54,11 +1440,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"109\n",
"LSTM 10000000\n",
"139\n",
"LSTM 1000\n",
"Downloading page 0 to 100 results\n",
"Downloading page 100 to 100 results\n",
"CORPUS #302558\n",
"CORPUS #303703\n",
"PARSING\n",
"Loading available PARSERS:\n",
"\t- EuropresseParser\n",
......@@ -74,42 +1460,45 @@
"\t- HalParser\n",
"\t- IsidoreParser\n",
"0 docs skipped\n",
"109 parsed\n",
"139 parsed\n",
"#MAIN language of the CORPUS __unknown__\n",
"CORPUS #302558: parsed 109\n",
"CORPUS #303703: parsed 139\n",
"#TAGGERS LOADED: {'__unknown__': <gargantext.util.taggers.NltkTagger.NltkTagger object at 0x7f03064496a0>}\n",
"#SUPPORTED TAGGER LANGS ['__unknown__']\n",
"INTEGRATE\n",
"INTEGRATE\n",
"INTEGRATE\n",
"CORPUS #302558: extracted ngrams\n",
"CORPUS #302558: indexed hyperdata\n",
"CORPUS #302558: [2017-09-14_17:00:50] new favorites node #302668\n",
"CORPUS #302558: [2017-09-14_17:00:50] starting ngram lists computation\n",
"CORPUS #302558: [2017-09-14_17:00:50] new stoplist node #302669\n",
"# STEMMERS LOADED {'__unknown__': <nltk.stem.snowball.SnowballStemmer object at 0x7f28053c00b8>}\n",
"CORPUS #303703: extracted ngrams\n",
"CORPUS #303703: indexed hyperdata\n",
"CORPUS #303703: [2017-10-10_09:34:23] new favorites node #303843\n",
"CORPUS #303703: [2017-10-10_09:34:23] starting ngram lists computation\n",
"CORPUS #303703: [2017-10-10_09:34:24] new stoplist node #303844\n",
"# STEMMERS LOADED {'__unknown__': <nltk.stem.snowball.SnowballStemmer object at 0x7f0306497cf8>}\n",
"#SUPPORTED STEMMERS LANGS []\n",
"CORPUS #302558: [2017-09-14_17:00:51] new grouplist node #302670\n",
"CORPUS #302558: [2017-09-14_17:00:51] new occs node #302671\n",
"CORPUS #303703: [2017-10-10_09:34:25] new grouplist node #303845\n",
"CORPUS #303703: [2017-10-10_09:34:25] new occs node #303846\n",
"compute_ti_ranking\n",
"2017-09-14_17:00:51 : Starting Query tf_nd_query\n",
"2017-09-14_17:00:51 : End Query tf_nd_quer\n",
"2017-09-14_17:00:51 : tfidfsum\n",
"CORPUS #302558: [2017-09-14_17:00:52] new ti ranking node #302672\n",
"MAINLIST: keeping 2588 ngrams out of 3451\n",
"CORPUS #302558: [2017-09-14_17:00:52] new mainlist node #302673\n",
"2017-10-10_09:34:25 : Starting Query tf_nd_query\n",
"2017-10-10_09:34:26 : End Query tf_nd_quer\n",
"2017-10-10_09:34:26 : tfidfsum\n",
"CORPUS #303703: [2017-10-10_09:34:26] new ti ranking node #303847\n",
"MAINLIST: keeping 3295 ngrams out of 4393\n",
"CORPUS #303703: [2017-10-10_09:34:26] new mainlist node #303848\n",
"Compute TFIDF local\n",
"CORPUS #302558: [2017-09-14_17:00:52] new localtfidf node #302674\n",
"COOCS: NEW matrix shape [164x212]\n",
"CORPUS #302558: [2017-09-14_17:00:55] computed mainlist coocs for specif rank\n",
"SPECIFICITY: computing on 157 ngrams\n",
"CORPUS #302558: [2017-09-14_17:00:55] new spec-clusion node #302675\n",
"CORPUS #302558: [2017-09-14_17:00:55] new gen-clusion node #302676\n",
"CORPUS #303703: [2017-10-10_09:34:26] new localtfidf node #303849\n",
"COOCS: NEW matrix shape [215x361]\n",
"CORPUS #303703: [2017-10-10_09:34:32] computed mainlist coocs for specif rank\n",
"SPECIFICITY: computing on 209 ngrams\n",
"CORPUS #303703: [2017-10-10_09:34:32] new spec-clusion node #303853\n",
"CORPUS #303703: [2017-10-10_09:34:32] new gen-clusion node #303854\n",
"MAPLIST quotas: {'topgen': {'multigrams': 168, 'monograms': 42}, 'topspec': {'multigrams': 112, 'monograms': 28}}\n",
"MAPLIST: top_spec_monograms = 28\n",
"MAPLIST: top_spec_multigrams = 41\n",
"MAPLIST: top_spec_multigrams = 55\n",
"MAPLIST: top_gen_monograms = 42\n",
"MAPLIST: top_gen_multigrams = 0\n",
"MAPLIST: kept 111 ngrams in total \n",
"CORPUS #302558: [2017-09-14_17:00:55] new maplist node #302677\n",
"CORPUS #302558: [2017-09-14_17:00:55] FINISHED ngram lists computation\n"
"MAPLIST: kept 125 ngrams in total \n",
"CORPUS #303703: [2017-10-10_09:34:32] new maplist node #303855\n",
"CORPUS #303703: [2017-10-10_09:34:32] FINISHED ngram lists computation\n"
]
}
],
......@@ -146,7 +1535,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {
"collapsed": false,
"deletable": true,
......@@ -156,62 +1545,42 @@
{
"data": {
"text/plain": [
"6"
"0"
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# search full text (english by default) and DELETE in the corpus\n",
"scan_gargantext_and_delete(corpus.id, \"machine | learning & deep\")"
"# search full text (english by default) in the corpus\n",
"scan_gargantext(corpus.id, \"machine | learning & deep\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"0"
"6"
]
},
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# search full text (english by default) in the corpus\n",
"scan_gargantext(corpus.id, \"machine | learning & deep\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"source": [
"# Others example"
"# search full text (english by default) and DELETE in the corpus\n",
"scan_gargantext_and_delete(corpus.id, \"machine | learning & deep\")"
]
},
{
......
......@@ -41,6 +41,512 @@
"%matplotlib inline "
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"source": [
"# Philomemies"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Instantiate the corpus you are working on"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"My corpus id is : 302695.\n"
]
}
],
"source": [
"corpus_url = \"http://localhost:8000/projects/302694/corpora/302695/\"\n",
"corpus_id = corpus_url.split(\"/\")[6]\n",
"print(\"My corpus id is : %s.\" % corpus_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Getting the Map Terms "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(21, 'environment'), (42, 'development'), (184, 'examples'), (196, 'water'), (368, 'problem'), (576, 'work'), (654, 'technology'), (712, 'number'), (738, 'operation'), (817, 'experiments')]\n"
]
}
],
"source": [
"from gargantext.models import *\n",
"import csv\n",
"\n",
"map_id = session.query(MaplistNode.id).filter(MaplistNode.parent_id == corpus_id).first()\n",
"\n",
"mapTerms = (session.query(Ngram).join( NodeNgram, NodeNgram.ngram_id == Ngram.id)\n",
" .filter(NodeNgram.node_id == map_id)\n",
" .all()\n",
" )\n",
"\n",
"print([(m.id, m.terms) for m in mapTerms[:10]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save in CSV File"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"toPrint = [(m.id,m.terms) for m in mapTerms]\n",
"csvfile = \"./MapTerms.csv\"\n",
"\n",
"#Assuming res is a flat list\n",
"with open(csvfile, \"w\") as output:\n",
" writer = csv.writer(output, lineterminator='\\n')\n",
" for val in toPrint:\n",
" writer.writerow([val])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Next:\n",
"# You can have access to your CSV file in the home of you Notebook!\n",
"# Click, rename, mv, delete in your Notebook\n",
"\n",
"#Assuming output is a list of lists\n",
"#with open(csvfile, \"w\") as output:\n",
"# writer = csv.writer(output, lineterminator='\\n')\n",
"# writer.writerows(res)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Occurrences of MapTerms by Year"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from gargantext.util.toolchain.metric_tfidf import compute_occs\n",
"\n",
"corpus= session.query(CorpusNode).get(corpus_id)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'abstract': 'The purpose of this paper is to develop a new fuzzy dynamic programming approach for solving hybrid multiobjective multistage decision-making problems. We first present a methodology of fuzzy evaluation and fuzzy optimization for hybrid multiobjective systems, in which the qualitative and quantitative objectives are synthetically considered. The qualitative objectives are evaluated by decision-makers with linguistic variables and the quantitative objectives are converted into proper dimensionless indices. After getting the marginal evaluations for each objective, a new aggregation method based on the principle of fuzzy pattern recognition is developed to get a global evaluation for all objectives. With the global evaluation obtained, a fuzzy optimization process is performed. Then we present a dynamic optimization algorithm by incorporating the fuzzy optimization process with the conventional dynamic programming technique to solve hybrid multiobjective multistage decision-making problems. A characteristic feature of the approach proposed is that various objectives are synthetically considered by the fuzzy systematic technique instead of the frequently employed weighted average method. Finally, an illustrative example is also given to clarify the developed approach and to demonstrate its effectiveness.',\n",
" 'authors': 'Lushu Li, K.K. Lai',\n",
" 'authorsRAW': [{'affiliations': ['Faculty of Administration, University of New Brunswick, Fredericton, N.B., Canada',\n",
" 'Corresponding author'],\n",
" 'name': 'Lushu Li'},\n",
" {'affiliations': ['Department of Management Science, City University of Hong Kong, Tat Chee Avenue, Kowloon, Hong Kong'],\n",
" 'name': 'K.K. Lai'}],\n",
" 'doi': '10.1016/S0165-0114(98)00423-0',\n",
" 'genre': ['research-article'],\n",
" 'id': '5E6CB638271D0121DB653AB9150D2F025346816A',\n",
" 'language_iso2': 'en',\n",
" 'language_iso3': 'eng',\n",
" 'language_name': 'English',\n",
" 'publication_date': '2001-01-01 00:00:00+00:00',\n",
" 'publication_day': 1,\n",
" 'publication_hour': 0,\n",
" 'publication_minute': 0,\n",
" 'publication_month': 1,\n",
" 'publication_second': 0,\n",
" 'publication_year': 2001,\n",
" 'source': 'Fuzzy Sets and Systems',\n",
" 'statuses': [],\n",
" 'title': 'Fuzzy dynamic programming approach to hybrid multiobjective multistage decision-making problems'}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To get all the documents:\n",
"docs = documents(corpus_id)\n",
"docs[0].hyperdata"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(1954, 2), (1956, 1), (1957, 1), (1958, 5), (1960, 3), (1961, 5), (1962, 2), (1963, 11), (1964, 5), (1965, 3), (1966, 1), (1967, 8), (1968, 17), (1969, 10), (1970, 8), (1971, 20), (1972, 12), (1973, 20), (1974, 16), (1975, 17), (1976, 8), (1977, 10), (1978, 14), (1979, 16), (1980, 28), (1981, 12), (1982, 14), (1983, 15), (1984, 19), (1985, 22), (1986, 27), (1987, 28), (1988, 24), (1989, 20), (1990, 26), (1991, 54), (1992, 48), (1993, 40), (1994, 40), (1995, 28), (1996, 32), (1997, 34), (1998, 30), (1999, 25), (2000, 37), (2001, 29), (2002, 13), (2003, 19), (2004, 17), (2005, 21), (2006, 17), (2007, 11), (2008, 10), (2009, 8), (2010, 9), (2011, 9), (2012, 12), (2013, 7)]\n"
]
}
],
"source": [
"pubsByYear = countByField(docs, \"publication_year\")\n",
"\n",
"print(pubsByYear)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1954, 1956, 1957, 1958, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013]\n"
]
}
],
"source": [
"years = [y for y in map(lambda x: x[0], pubsByYear)]\n",
"print(years)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# To Add the groups you need to get the Node\n",
"group_id = session.query(GrouplistNode.id).filter(GrouplistNode.parent_id == corpus_id).first()\n",
"\n",
"occByYear = list()\n",
"\n",
"# Not optmized yet since sql request is launched for each year\n",
"# We will use a group by if needed, depends on the size of corpus\n",
"# Clarity of the computation is first done here\n",
"# Optmization will be the step After\n",
"for year in years:\n",
" listNgramOcc = compute_occs(corpus, groupings_id=group_id, year=year, interactiv=True)\n",
" listYearNgramOcc = [(year, ngram_id, occ) for (ngram_id, occ) in listNgramOcc]\n",
" occByYear.append(listYearNgramOcc)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[(1954, 5249, 1.0), (1954, 5366, 1.0), (1954, 7019, 1.0), (1954, 10524, 1.0), (1954, 121362, 1.0), (1954, 505775, 1.0)], [(1956, 7019, 1.0), (1956, 8604, 1.0), (1956, 755610, 1.0), (1956, 2361839, 1.0)]]\n"
]
}
],
"source": [
"\n",
"# Saving the results in file\n",
"toPrint = [(m.id,m.terms) for m in mapTerms]\n",
"csvfile = \"./MapTerms.csv\"\n",
"\n",
"#Assuming res is a flat list\n",
"with open(csvfile, \"w\") as output:\n",
" writer = csv.writer(output, lineterminator='\\n')\n",
" for val in toPrint:\n",
" writer.writerow([val])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"mapTermsOcc = (session.query(Occurrences).join( MapTerms, MapTerms.ngram_id == Occurrences.ngram_id)\n",
" .filter(MapTerms.node_id == map_id)\n",
" \n",
" .join(Documents, Documents.id == Occurrences.node2_id)\n",
" .filter(Documents.parent_id == corpus_id)\n",
" \n",
" .filter(Occurrences.node1_id == occ_id)\n",
" \n",
" #.group_by(Occurrences.ngram_id)\n",
" .all()\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(303698)"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"group_id"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mapTermsOcc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cooccurrences of MapTerms by Year"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from graph.cooccurrences import countCooccurrences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
" (cooc_id, cooc_matrix) = countCooccurrences( corpus_id=corpus_id, cooc_id= \n",
" , field1=field1, field2=field2 \n",
" , start=start , end =end \n",
" , mapList_id=mapList_id , groupList_id=groupList_id \n",
" , isMonopartite=True , threshold = threshold \n",
" , distance=distance , bridgeness=bridgeness \n",
" , save_on_db = True , reset = reset \n",
" ) "
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GRAPH #303869 Filtering the matrix with Map and Group Lists.\n",
"WeightedMatrix bulk_insert start\n",
"WeightedMatrix bulk_insert stop\n",
"GRAPH #303869 ... Node Cooccurrence Matrix saved\n",
"GRAPH #303869 ... Parameters saved in Node.\n"
]
}
],
"source": [
"#countCooccurrences(corpus_id, save_on_db=False, start=\"2000-01-01\", end=\"2017-12-31\")\n",
"(cooc_id, cooc_matrix) = countCooccurrences( corpus_id = corpus_id\n",
" , cooc_id = None\n",
" , field1=\"ngrams\", field2 = \"ngrams\"\n",
" \n",
" , mapList_id = map_id\n",
" , groupList_id = group_id\n",
" \n",
" , isMonopartite =True , threshold = 2 \n",
" #, distance =Non , bridgeness=bridgeness\n",
" \n",
" , save_on_db = True\n",
" , reset = True\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(float, {})"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cooc_matrix.items"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Number of Documents per year"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Date DateValue\n",
"Date \n",
"1954 1954 2\n",
"1956 1956 1\n",
"1957 1957 1\n",
"1958 1958 5\n",
"1960 1960 3\n",
"1961 1961 5\n",
"1962 1962 2\n",
"1963 1963 11\n",
"1964 1964 5\n",
"1965 1965 3\n"
]
}
],
"source": [
"# To get all the documents:\n",
"docs = documents(corpus_id)\n",
"# If I want to count:\n",
"myChart = chart(docs, \"publication_year\")\n",
"print(myChart[:10])"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"source": [
"# Others example"
]
},
{
"cell_type": "code",
"execution_count": 3,
......@@ -54,11 +560,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"109\n",
"LSTM 10000000\n",
"139\n",
"LSTM 1000\n",
"Downloading page 0 to 100 results\n",
"Downloading page 100 to 100 results\n",
"CORPUS #302558\n",
"CORPUS #303703\n",
"PARSING\n",
"Loading available PARSERS:\n",
"\t- EuropresseParser\n",
......@@ -74,42 +580,45 @@
"\t- HalParser\n",
"\t- IsidoreParser\n",
"0 docs skipped\n",
"109 parsed\n",
"139 parsed\n",
"#MAIN language of the CORPUS __unknown__\n",
"CORPUS #302558: parsed 109\n",
"CORPUS #303703: parsed 139\n",
"#TAGGERS LOADED: {'__unknown__': <gargantext.util.taggers.NltkTagger.NltkTagger object at 0x7f03064496a0>}\n",
"#SUPPORTED TAGGER LANGS ['__unknown__']\n",
"INTEGRATE\n",
"INTEGRATE\n",
"INTEGRATE\n",
"CORPUS #302558: extracted ngrams\n",
"CORPUS #302558: indexed hyperdata\n",
"CORPUS #302558: [2017-09-14_17:00:50] new favorites node #302668\n",
"CORPUS #302558: [2017-09-14_17:00:50] starting ngram lists computation\n",
"CORPUS #302558: [2017-09-14_17:00:50] new stoplist node #302669\n",
"# STEMMERS LOADED {'__unknown__': <nltk.stem.snowball.SnowballStemmer object at 0x7f28053c00b8>}\n",
"CORPUS #303703: extracted ngrams\n",
"CORPUS #303703: indexed hyperdata\n",
"CORPUS #303703: [2017-10-10_09:34:23] new favorites node #303843\n",
"CORPUS #303703: [2017-10-10_09:34:23] starting ngram lists computation\n",
"CORPUS #303703: [2017-10-10_09:34:24] new stoplist node #303844\n",
"# STEMMERS LOADED {'__unknown__': <nltk.stem.snowball.SnowballStemmer object at 0x7f0306497cf8>}\n",
"#SUPPORTED STEMMERS LANGS []\n",
"CORPUS #302558: [2017-09-14_17:00:51] new grouplist node #302670\n",
"CORPUS #302558: [2017-09-14_17:00:51] new occs node #302671\n",
"CORPUS #303703: [2017-10-10_09:34:25] new grouplist node #303845\n",
"CORPUS #303703: [2017-10-10_09:34:25] new occs node #303846\n",
"compute_ti_ranking\n",
"2017-09-14_17:00:51 : Starting Query tf_nd_query\n",
"2017-09-14_17:00:51 : End Query tf_nd_quer\n",
"2017-09-14_17:00:51 : tfidfsum\n",
"CORPUS #302558: [2017-09-14_17:00:52] new ti ranking node #302672\n",
"MAINLIST: keeping 2588 ngrams out of 3451\n",
"CORPUS #302558: [2017-09-14_17:00:52] new mainlist node #302673\n",
"2017-10-10_09:34:25 : Starting Query tf_nd_query\n",
"2017-10-10_09:34:26 : End Query tf_nd_quer\n",
"2017-10-10_09:34:26 : tfidfsum\n",
"CORPUS #303703: [2017-10-10_09:34:26] new ti ranking node #303847\n",
"MAINLIST: keeping 3295 ngrams out of 4393\n",
"CORPUS #303703: [2017-10-10_09:34:26] new mainlist node #303848\n",
"Compute TFIDF local\n",
"CORPUS #302558: [2017-09-14_17:00:52] new localtfidf node #302674\n",
"COOCS: NEW matrix shape [164x212]\n",
"CORPUS #302558: [2017-09-14_17:00:55] computed mainlist coocs for specif rank\n",
"SPECIFICITY: computing on 157 ngrams\n",
"CORPUS #302558: [2017-09-14_17:00:55] new spec-clusion node #302675\n",
"CORPUS #302558: [2017-09-14_17:00:55] new gen-clusion node #302676\n",
"CORPUS #303703: [2017-10-10_09:34:26] new localtfidf node #303849\n",
"COOCS: NEW matrix shape [215x361]\n",
"CORPUS #303703: [2017-10-10_09:34:32] computed mainlist coocs for specif rank\n",
"SPECIFICITY: computing on 209 ngrams\n",
"CORPUS #303703: [2017-10-10_09:34:32] new spec-clusion node #303853\n",
"CORPUS #303703: [2017-10-10_09:34:32] new gen-clusion node #303854\n",
"MAPLIST quotas: {'topgen': {'multigrams': 168, 'monograms': 42}, 'topspec': {'multigrams': 112, 'monograms': 28}}\n",
"MAPLIST: top_spec_monograms = 28\n",
"MAPLIST: top_spec_multigrams = 41\n",
"MAPLIST: top_spec_multigrams = 55\n",
"MAPLIST: top_gen_monograms = 42\n",
"MAPLIST: top_gen_multigrams = 0\n",
"MAPLIST: kept 111 ngrams in total \n",
"CORPUS #302558: [2017-09-14_17:00:55] new maplist node #302677\n",
"CORPUS #302558: [2017-09-14_17:00:55] FINISHED ngram lists computation\n"
"MAPLIST: kept 125 ngrams in total \n",
"CORPUS #303703: [2017-10-10_09:34:32] new maplist node #303855\n",
"CORPUS #303703: [2017-10-10_09:34:32] FINISHED ngram lists computation\n"
]
}
],
......@@ -146,7 +655,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {
"collapsed": false,
"deletable": true,
......@@ -156,62 +665,42 @@
{
"data": {
"text/plain": [
"6"
"0"
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# search full text (english by default) and DELETE in the corpus\n",
"scan_gargantext_and_delete(corpus.id, \"machine | learning & deep\")"
"# search full text (english by default) in the corpus\n",
"scan_gargantext(corpus.id, \"machine | learning & deep\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"0"
"6"
]
},
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# search full text (english by default) in the corpus\n",
"scan_gargantext(corpus.id, \"machine | learning & deep\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"source": [
"# Others example"
"# search full text (english by default) and DELETE in the corpus\n",
"scan_gargantext_and_delete(corpus.id, \"machine | learning & deep\")"
]
},
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment