Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
9ab3433d
Commit
9ab3433d
authored
Mar 22, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
new args 'start' and 'end' in toolchain.ngram_coocs (+ fixing NodeHyperdata slightly)
parent
5d3417fc
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
69 additions
and
10 deletions
+69
-10
constants.py
gargantext/constants.py
+2
-0
hyperdata.py
gargantext/models/hyperdata.py
+9
-7
_Parser.py
gargantext/util/parsers/_Parser.py
+1
-0
hyperdata_indexing.py
gargantext/util/toolchain/hyperdata_indexing.py
+12
-1
ngram_coocs.py
gargantext/util/toolchain/ngram_coocs.py
+45
-2
No files found.
gargantext/constants.py
View file @
9ab3433d
...
@@ -37,6 +37,7 @@ NODETYPES = [
...
@@ -37,6 +37,7 @@ NODETYPES = [
'TFIDF-GLOBAL'
,
# 14
'TFIDF-GLOBAL'
,
# 14
]
]
# TODO find somewhere else than constants.py for function
import
datetime
import
datetime
import
dateutil
import
dateutil
def
convert_to_date
(
date
):
def
convert_to_date
(
date
):
...
@@ -46,6 +47,7 @@ def convert_to_date(date):
...
@@ -46,6 +47,7 @@ def convert_to_date(date):
return
dateutil
.
parser
.
parse
(
date
)
return
dateutil
.
parser
.
parse
(
date
)
INDEXED_HYPERDATA
=
{
INDEXED_HYPERDATA
=
{
# TODO use properties during toolchain.hyperdata_indexing (type, convert_to_db, convert_from_db)
'publication_date'
:
'publication_date'
:
{
'id'
:
1
,
'type'
:
datetime
.
datetime
,
'convert_to_db'
:
convert_to_date
,
'convert_from_db'
:
datetime
.
datetime
.
fromtimestamp
},
{
'id'
:
1
,
'type'
:
datetime
.
datetime
,
'convert_to_db'
:
convert_to_date
,
'convert_from_db'
:
datetime
.
datetime
.
fromtimestamp
},
'title'
:
'title'
:
...
...
gargantext/models/hyperdata.py
View file @
9ab3433d
...
@@ -25,20 +25,20 @@ class HyperdataValueComparer(object):
...
@@ -25,20 +25,20 @@ class HyperdataValueComparer(object):
class
HyperdataKey
(
TypeDecorator
):
class
HyperdataKey
(
TypeDecorator
):
"""Define a new type of column to describe a
Node
's type.
"""Define a new type of column to describe a
Hyperdata field
's type.
Internally, this column type is implemented as an SQL integer.
Internally, this column type is implemented as an SQL integer.
Values are detailed in `gargantext.constants.
NODETYPES
`.
Values are detailed in `gargantext.constants.
INDEXED_HYPERDATA
`.
"""
"""
impl
=
Integer
impl
=
Integer
def
process_bind_param
(
self
,
keyname
,
dialect
):
def
process_bind_param
(
self
,
keyname
,
dialect
):
if
keyname
in
INDEXED_HYPERDATA
:
if
keyname
in
INDEXED_HYPERDATA
:
return
INDEXED_HYPERDATA
[
keyname
]
return
INDEXED_HYPERDATA
[
keyname
]
[
'id'
]
raise
ValueError
(
'Hyperdata key "
%
s" was not found in `gargantext.constants.
NODETYPES
`'
%
keyname
)
raise
ValueError
(
'Hyperdata key "
%
s" was not found in `gargantext.constants.
INDEXED_HYPERDATA
`'
%
keyname
)
def
process_result_value
(
self
,
keyindex
,
dialect
):
def
process_result_value
(
self
,
keyindex
,
dialect
):
for
keyname
,
key
in
INDEXED_HYPERDATA
:
for
keyname
,
key
subhash
in
INDEXED_HYPERDATA
.
items
()
:
if
key
[
'id'
]
==
keyindex
:
if
key
subhash
[
'id'
]
==
keyindex
:
return
keyname
return
keyname
raise
ValueError
(
'Hyperdata key with id=
%
d was not found in `gargantext.constants.
NODETYPES
`'
%
keyindex
)
raise
ValueError
(
'Hyperdata key with id=
%
d was not found in `gargantext.constants.
INDEXED_HYPERDATA
`'
%
keyindex
)
class
NodeHyperdata
(
Base
):
class
NodeHyperdata
(
Base
):
...
@@ -85,6 +85,7 @@ class NodeHyperdata(Base):
...
@@ -85,6 +85,7 @@ class NodeHyperdata(Base):
# value
# value
self
.
value
=
value
self
.
value
=
value
# FIXME
@
property
@
property
def
value
(
self
):
def
value
(
self
):
"""Pseudo-attribute used to extract the value in the right format.
"""Pseudo-attribute used to extract the value in the right format.
...
@@ -123,6 +124,7 @@ def HyperdataValueComparer_overrider(key):
...
@@ -123,6 +124,7 @@ def HyperdataValueComparer_overrider(key):
if
isinstance
(
args
[
0
],
str
):
if
isinstance
(
args
[
0
],
str
):
return
getattr
(
NodeHyperdata
.
value_str
,
key
)(
*
args
)
return
getattr
(
NodeHyperdata
.
value_str
,
key
)(
*
args
)
return
comparator
return
comparator
# ??
for
key
in
set
(
dir
(
NodeHyperdata
.
value_flt
)
+
dir
(
NodeHyperdata
.
value_str
)):
for
key
in
set
(
dir
(
NodeHyperdata
.
value_flt
)
+
dir
(
NodeHyperdata
.
value_str
)):
if
key
in
(
'__dict__'
,
'__weakref__'
,
'__repr__'
,
'__str__'
)
or
'attr'
in
key
or
'class'
in
key
or
'init'
in
key
or
'new'
in
key
:
if
key
in
(
'__dict__'
,
'__weakref__'
,
'__repr__'
,
'__str__'
)
or
'attr'
in
key
or
'class'
in
key
or
'init'
in
key
or
'new'
in
key
:
continue
continue
...
...
gargantext/util/parsers/_Parser.py
View file @
9ab3433d
...
@@ -78,6 +78,7 @@ class Parser:
...
@@ -78,6 +78,7 @@ class Parser:
except
:
except
:
pass
pass
else
:
else
:
print
(
"WARNING: Date unknown at _Parser level, using now()"
)
hyperdata
[
'publication_date'
]
=
datetime
.
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
hyperdata
[
'publication_date'
]
=
datetime
.
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
# ...then parse all the "date" fields, to parse it into separate elements
# ...then parse all the "date" fields, to parse it into separate elements
...
...
gargantext/util/toolchain/hyperdata_indexing.py
View file @
9ab3433d
from
gargantext.util.db
import
bulk_insert
from
gargantext.util.db
import
bulk_insert
from
gargantext.constants
import
INDEXED_HYPERDATA
from
gargantext.constants
import
INDEXED_HYPERDATA
from
gargantext.models
import
NodeHyperdata
from
gargantext.models
import
NodeHyperdata
from
datetime
import
datetime
def
_nodes_hyperdata_generator
(
corpus
):
def
_nodes_hyperdata_generator
(
corpus
):
"""This method generates columns for insertions in `nodes_hyperdata`.
"""This method generates columns for insertions in `nodes_hyperdata`.
In case one of the values is a list, its items are iterated over and
In case one of the values is a list, its items are iterated over and
yielded separately.
yielded separately.
If it's a string (e.g. a date), it will be truncated to 255 chars.
"""
"""
for
document
in
corpus
.
children
(
typename
=
'DOCUMENT'
):
for
document
in
corpus
.
children
(
typename
=
'DOCUMENT'
):
for
keyname
,
key
in
INDEXED_HYPERDATA
.
items
():
for
keyname
,
key
in
INDEXED_HYPERDATA
.
items
():
...
@@ -29,6 +30,16 @@ def _nodes_hyperdata_generator(corpus):
...
@@ -29,6 +30,16 @@ def _nodes_hyperdata_generator(corpus):
None
,
None
,
value
[:
255
],
value
[:
255
],
)
)
elif
isinstance
(
value
,
(
datetime
,
)):
yield
(
document
.
id
,
key
[
'id'
],
None
,
# value_str
value
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
),
)
else
:
print
(
"WARNING: Couldn't insert an INDEXED_HYPERDATA value because of unknown type:"
,
type
(
value
))
def
index_hyperdata
(
corpus
):
def
index_hyperdata
(
corpus
):
...
...
gargantext/util/toolchain/ngram_coocs.py
View file @
9ab3433d
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNgramNgram
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNgramNgram
,
\
NodeHyperdata
from
gargantext.util.lists
import
WeightedMatrix
from
gargantext.util.lists
import
WeightedMatrix
from
gargantext.util.db
import
session
,
aliased
,
func
from
gargantext.util.db
import
session
,
aliased
,
func
from
gargantext.util.db_cache
import
cache
from
gargantext.util.db_cache
import
cache
from
gargantext.constants
import
DEFAULT_COOC_THRESHOLD
from
gargantext.constants
import
DEFAULT_COOC_THRESHOLD
from
datetime
import
datetime
def
compute_coocs
(
corpus
,
def
compute_coocs
(
corpus
,
overwrite_id
=
None
,
overwrite_id
=
None
,
threshold
=
DEFAULT_COOC_THRESHOLD
,
threshold
=
DEFAULT_COOC_THRESHOLD
,
mainlist_id
=
None
,
mainlist_id
=
None
,
stoplist_id
=
None
,
stoplist_id
=
None
,
start
=
None
,
end
=
None
,
symmetry_filter
=
True
):
symmetry_filter
=
True
):
"""
"""
Count how often some extracted terms appear
Count how often some extracted terms appear
...
@@ -40,6 +44,10 @@ def compute_coocs(corpus,
...
@@ -40,6 +44,10 @@ def compute_coocs(corpus,
- mainlist_id: mainlist to constrain the input ngrams
- mainlist_id: mainlist to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
- stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is provided)
(normally unnecessary if a mainlist is provided)
- start, end: provide one or both temporal limits to filter on doc date
NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow
this convention: "2001-01-01" aka "%Y-%m-%d")
(deprecated parameters)
(deprecated parameters)
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
...
@@ -68,7 +76,6 @@ def compute_coocs(corpus,
...
@@ -68,7 +76,6 @@ def compute_coocs(corpus,
# - TODO cvalue_id: allow a metric as additional input filter
# - TODO cvalue_id: allow a metric as additional input filter
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO start, end : filter on document date
# - TODO weighted: if False normal cooc to be saved as result
# - TODO weighted: if False normal cooc to be saved as result
# if True weighted cooc (experimental)
# if True weighted cooc (experimental)
...
@@ -127,6 +134,42 @@ def compute_coocs(corpus,
...
@@ -127,6 +134,42 @@ def compute_coocs(corpus,
.
filter
(
~
x2
.
ngram_id
.
in_
(
stop_subquery
)
)
.
filter
(
~
x2
.
ngram_id
.
in_
(
stop_subquery
)
)
)
)
if
start
:
if
isinstance
(
start
,
datetime
):
start_str
=
start
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
else
:
start_str
=
str
(
start
)
# doc_ids matching this limit
starttime_subquery
=
(
session
.
query
(
NodeHyperdata
.
node_id
)
.
filter
(
NodeHyperdata
.
key
==
"publication_date"
)
.
filter
(
NodeHyperdata
.
value_str
>=
start_str
)
.
subquery
()
)
# direct use of str comparison op because there is consistency b/w
# sql alpha sort and chrono sort *in this format %Y-%m-%d %H:%M:%S*
# the filtering by start limit
coocs_query
=
coocs_query
.
filter
(
x1
.
node_id
.
in_
(
starttime_subquery
))
if
end
:
if
isinstance
(
end
,
datetime
):
end_str
=
end
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
else
:
end_str
=
str
(
end
)
endtime_subquery
=
(
session
.
query
(
NodeHyperdata
.
node_id
)
.
filter
(
NodeHyperdata
.
key
==
"publication_date"
)
.
filter
(
NodeHyperdata
.
value_str
<=
end_str
)
.
subquery
()
)
# the filtering by end limit
coocs_query
=
coocs_query
.
filter
(
x1
.
node_id
.
in_
(
endtime_subquery
))
if
symmetry_filter
:
if
symmetry_filter
:
# 1 filtre tenant en compte de la symétrie
# 1 filtre tenant en compte de la symétrie
# -> réduit le travail de moitié !!
# -> réduit le travail de moitié !!
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment