Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
149
Issues
149
List
Board
Labels
Milestones
Merge Requests
8
Merge Requests
8
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
b0b6a491
Commit
b0b6a491
authored
Nov 07, 2022
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[ngrams] implemented doc_count column
parent
092d482b
Pipeline
#3368
passed with stage
in 92 minutes and 7 seconds
Changes
6
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
75 additions
and
27 deletions
+75
-27
Search.hs
src/Gargantext/API/Search.hs
+3
-3
Terms.hs
src/Gargantext/Core/Text/Terms.hs
+4
-11
Flow.hs
src/Gargantext/Database/Action/Flow.hs
+1
-3
Search.hs
src/Gargantext/Database/Action/Search.hs
+62
-4
Facet.hs
src/Gargantext/Database/Query/Facet.hs
+2
-2
Join.hs
src/Gargantext/Database/Query/Join.hs
+3
-4
No files found.
src/Gargantext/API/Search.hs
View file @
b0b6a491
...
...
@@ -70,7 +70,7 @@ api nId (SearchQuery q SearchContact) o l order = do
<$>
map
(
toRow
aId
)
<$>
searchInCorpusWithContacts
nId
aId
(
concat
q
)
o
l
order
api
nId
(
SearchQuery
q
SearchDocWithNgrams
)
o
l
order
=
undefined
api
_nId
(
SearchQuery
_q
SearchDocWithNgrams
)
_o
_l
_
order
=
undefined
-----------------------------------------------------------------------
-----------------------------------------------------------------------
...
...
@@ -135,7 +135,7 @@ instance FromJSON SearchResultTypes where
parseJSON
=
genericParseJSON
(
defaultOptions
{
sumEncoding
=
defaultTaggedObject
})
instance
ToJSON
SearchResultTypes
where
toJSON
=
genericToJSON
(
defaultOptions
{
sumEncoding
=
defaultTaggedObject
})
instance
Arbitrary
SearchResultTypes
where
arbitrary
=
do
srd
<-
SearchResultDoc
<$>
arbitrary
...
...
@@ -166,7 +166,7 @@ data Row =
deriving
(
Generic
)
instance
FromJSON
Row
where
parseJSON
=
genericParseJSON
parseJSON
=
genericParseJSON
(
defaultOptions
{
sumEncoding
=
defaultTaggedObject
}
)
instance
ToJSON
Row
where
...
...
src/Gargantext/Core/Text/Terms.hs
View file @
b0b6a491
...
...
@@ -171,17 +171,10 @@ isSimpleNgrams _ = False
-- 'MonoMulti' : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms
::
TermType
Lang
->
Text
->
IO
[
TermsWithCount
]
terms
tt
txt
=
do
printDebug
"[terms] tt"
tt
printDebug
"[terms] txt"
txt
out
<-
termsNoLog
tt
txt
printDebug
"[terms] out"
out
pure
out
termsNoLog
::
TermType
Lang
->
Text
->
IO
[
TermsWithCount
]
termsNoLog
(
Mono
lang
)
txt
=
pure
$
monoTerms
lang
txt
termsNoLog
(
Multi
lang
)
txt
=
multiterms
lang
txt
termsNoLog
(
MonoMulti
lang
)
txt
=
terms
(
Multi
lang
)
txt
termsNoLog
(
Unsupervised
{
..
})
txt
=
pure
$
termsUnsupervised
(
Unsupervised
{
_tt_model
=
Just
m'
,
..
})
txt
terms
(
Mono
lang
)
txt
=
pure
$
monoTerms
lang
txt
terms
(
Multi
lang
)
txt
=
multiterms
lang
txt
terms
(
MonoMulti
lang
)
txt
=
terms
(
Multi
lang
)
txt
terms
(
Unsupervised
{
..
})
txt
=
pure
$
termsUnsupervised
(
Unsupervised
{
_tt_model
=
Just
m'
,
..
})
txt
where
m'
=
maybe
(
newTries
_tt_ngramsSize
txt
)
identity
_tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
...
...
src/Gargantext/Database/Action/Flow.hs
View file @
b0b6a491
...
...
@@ -375,7 +375,7 @@ saveDocNgramsWith :: (FlowCmdM env err m)
->
HashMap
ExtractedNgrams
(
Map
NgramsType
(
Map
NodeId
(
Int
,
TermsCount
)))
->
m
()
saveDocNgramsWith
lId
mapNgramsDocs'
=
do
printDebug
"[saveDocNgramsWith] mapNgramsDocs'"
mapNgramsDocs'
--
printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
let
mapNgramsDocsNoCount
=
over
(
traverse
.
traverse
.
traverse
)
fst
mapNgramsDocs'
terms2id
<-
insertExtractedNgrams
$
HashMap
.
keys
mapNgramsDocsNoCount
...
...
@@ -524,8 +524,6 @@ instance ExtractNgramsT HyperdataDocument
termsWithCounts'
<-
map
(
\
(
t
,
cnt
)
->
(
enrichedTerms
(
lang'
^.
tt_lang
)
CoreNLP
NP
t
,
cnt
))
<$>
concat
<$>
liftBase
(
extractTerms
lang'
$
hasText
doc
)
printDebug
"[extractNgramsT HyperdataDocument] termsWithCounts'"
termsWithCounts'
printDebug
"[extractNgramsT HyperdataDocument] termsWithLargerCounts"
$
filter
(
\
(
_
,
cnt
)
->
cnt
>
1
)
termsWithCounts'
pure
$
HashMap
.
fromList
$
[(
SimpleNgrams
source
,
(
Map
.
singleton
Sources
1
,
1
))
]
...
...
src/Gargantext/Database/Action/Search.hs
View file @
b0b6a491
...
...
@@ -14,7 +14,10 @@ module Gargantext.Database.Action.Search where
import
Control.Arrow
(
returnA
)
import
Control.Lens
((
^.
))
import
qualified
Data.List
as
List
import
qualified
Data.Map
as
Map
import
Data.Maybe
import
qualified
Data.Set
as
Set
import
Data.Text
(
Text
,
unpack
,
intercalate
)
import
Data.Time
(
UTCTime
)
import
Gargantext.Core
...
...
@@ -26,9 +29,11 @@ import Gargantext.Database.Query.Filter
import
Gargantext.Database.Query.Join
(
leftJoin5
)
import
Gargantext.Database.Query.Table.Node
import
Gargantext.Database.Query.Table.Context
import
Gargantext.Database.Query.Table.ContextNodeNgrams
(
queryContextNodeNgramsTable
)
import
Gargantext.Database.Query.Table.NodeContext
import
Gargantext.Database.Schema.Ngrams
(
NgramsType
(
..
))
import
Gargantext.Database.Query.Table.NodeContext_NodeContext
import
Gargantext.Database.Schema.ContextNodeNgrams
(
ContextNodeNgramsPoly
(
..
))
import
Gargantext.Database.Schema.Ngrams
(
NgramsType
(
..
))
import
Gargantext.Database.Schema.Node
import
Gargantext.Database.Schema.Context
import
Gargantext.Prelude
...
...
@@ -44,7 +49,7 @@ searchDocInDatabase :: HasDBid NodeType
->
Cmd
err
[(
NodeId
,
HyperdataDocument
)]
searchDocInDatabase
p
t
=
runOpaQuery
(
queryDocInDatabase
p
t
)
where
-- | Global search query where ParentId is Master Node Corpus Id
-- | Global search query where ParentId is Master Node Corpus Id
queryDocInDatabase
::
ParentId
->
Text
->
O
.
Select
(
Column
SqlInt4
,
Column
SqlJsonb
)
queryDocInDatabase
_p
q
=
proc
()
->
do
row
<-
queryNodeSearchTable
-<
()
...
...
@@ -54,6 +59,8 @@ searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
------------------------------------------------------------------------
-- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
-- search only to map/candidate terms.
searchInCorpusWithNgrams
::
HasDBid
NodeType
=>
CorpusId
->
ListId
...
...
@@ -64,7 +71,59 @@ searchInCorpusWithNgrams :: HasDBid NodeType
->
Maybe
Limit
->
Maybe
OrderBy
->
Cmd
err
[
FacetDoc
]
searchInCorpusWithNgrams
cId
lId
t
ngt
q
o
l
order
=
undefined
searchInCorpusWithNgrams
_cId
_lId
_t
_ngt
_q
_o
_l
_order
=
undefined
-- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
-- case only the "TF" part makes sense and so we only compute the
-- ratio of "number of times our terms appear in given document" and
-- "number of all terms in document" and return a sorted list of
-- document ids
tfidfAll
::
CorpusId
->
[
Int
]
->
Cmd
err
[
Int
]
tfidfAll
cId
ngramIds
=
do
let
ngramIdsSet
=
Set
.
fromList
ngramIds
docsWithNgrams
<-
runOpaQuery
(
queryCorpusWithNgrams
cId
ngramIds
)
::
Cmd
err
[(
Int
,
Int
,
Int
)]
-- NOTE The query returned docs with ANY ngramIds. We need to further
-- restrict to ALL ngramIds.
let
docsNgramsM
=
Map
.
fromListWith
(
Set
.
union
)
[
(
ctxId
,
Set
.
singleton
ngrams_id
)
|
(
ctxId
,
ngrams_id
,
_
)
<-
docsWithNgrams
]
let
docsWithAllNgramsS
=
Set
.
fromList
$
List
.
map
fst
$
List
.
filter
(
\
(
_
,
docNgrams
)
->
ngramIdsSet
==
Set
.
intersection
ngramIdsSet
docNgrams
)
$
Map
.
toList
docsNgramsM
let
docsWithAllNgrams
=
List
.
filter
(
\
(
ctxId
,
_
,
_
)
->
Set
.
member
ctxId
docsWithAllNgramsS
)
docsWithNgrams
printDebug
"[tfidfAll] docsWithAllNgrams"
docsWithAllNgrams
let
docsWithCounts
=
Map
.
fromListWith
(
+
)
[
(
ctxId
,
doc_count
)
|
(
ctxId
,
_
,
doc_count
)
<-
docsWithAllNgrams
]
printDebug
"[tfidfAll] docsWithCounts"
docsWithCounts
let
totals
=
[
(
ctxId
,
ngrams_id
,
fromIntegral
doc_count
::
Double
,
fromIntegral
(
fromMaybe
0
$
Map
.
lookup
ctxId
docsWithCounts
)
::
Double
)
|
(
ctxId
,
ngrams_id
,
doc_count
)
<-
docsWithAllNgrams
]
let
tfidf_sorted
=
List
.
sortOn
snd
[(
ctxId
,
doc_count
/
s
)
|
(
ctxId
,
_
,
doc_count
,
s
)
<-
totals
]
pure
$
List
.
map
fst
$
List
.
reverse
tfidf_sorted
-- | Query for searching the 'context_node_ngrams' table so that we
-- find docs with ANY given 'ngramIds'.
queryCorpusWithNgrams
::
CorpusId
->
[
Int
]
->
Select
(
Column
SqlInt4
,
Column
SqlInt4
,
Column
SqlInt4
)
queryCorpusWithNgrams
cId
ngramIds
=
proc
()
->
do
row
<-
queryContextNodeNgramsTable
-<
()
restrict
-<
(
_cnng_node_id
row
)
.==
(
pgNodeId
cId
)
restrict
-<
in_
(
sqlInt4
<$>
ngramIds
)
(
_cnng_ngrams_id
row
)
returnA
-<
(
_cnng_context_id
row
,
_cnng_ngrams_id
row
,
_cnng_doc_count
row
)
--returnA -< row
-- returnA -< ( _cnng_context_id row
-- , _cnng_node_id row
-- , _cnng_ngrams_id row
-- , _cnng_ngramsType row
-- , _cnng_weight row
-- , _cnng_doc_count row)
------------------------------------------------------------------------
...
...
@@ -225,4 +284,3 @@ queryContactViaDoc =
)
)
->
Column
SqlBool
cond45
(
doc
,
(
corpus
,
(
_
,(
_
,
_
))))
=
doc
^.
cs_id
.==
corpus
^.
nc_context_id
src/Gargantext/Database/Query/Facet.hs
View file @
b0b6a491
...
...
@@ -103,7 +103,7 @@ data Facet id created title hyperdata category ngramCount score =
,
facetDoc_score
::
score
}
deriving
(
Show
,
Generic
)
{- | TODO after demo
data Facet id date hyperdata score =
data Facet id date hyperdata score =
FacetDoc { facetDoc_id :: id
, facetDoc_date :: date
, facetDoc_hyperdata :: hyperdata
...
...
@@ -163,7 +163,7 @@ type FacetPairedReadNull = FacetPaired (Column (Nullable SqlInt4) )
(
Column
(
Nullable
SqlInt4
)
)
type
FacetPairedReadNullAgg
=
FacetPaired
(
Aggregator
(
Column
(
Nullable
SqlInt4
)
)
(
Column
(
Nullable
SqlInt4
)
)
(
Column
(
Nullable
SqlInt4
)
)
)
(
Aggregator
(
Column
(
Nullable
SqlTimestamptz
))
(
Column
(
Nullable
SqlTimestamptz
))
...
...
src/Gargantext/Database/Query/Join.hs
View file @
b0b6a491
...
...
@@ -43,7 +43,7 @@ import Opaleye.Internal.Join (NullMaker(..))
import
qualified
Opaleye.Internal.Unpackspec
()
keepWhen
::
(
a
->
Field
SqlBool
)
->
SelectArr
a
a
keepWhen
::
(
a
->
Field
SqlBool
)
->
SelectArr
a
a
keepWhen
p
=
proc
a
->
do
restrict
-<
p
a
returnA
-<
a
...
...
@@ -61,7 +61,7 @@ leftJoin2 = leftJoin
------------------------------------------------------------------------
-- | LeftJoin3 in two ways to write it
leftJoin3
::
Select
columnsA
->
Select
columnsB
->
Select
columnsC
->
((
columnsA
,
columnsB
,
columnsC
)
->
Column
SqlBool
)
->
((
columnsA
,
columnsB
,
columnsC
)
->
Column
SqlBool
)
->
Select
(
columnsA
,
columnsB
,
columnsC
)
leftJoin3
q1
q2
q3
cond
=
((,,)
<$>
q1
<*>
q2
<*>
q3
)
>>>
keepWhen
cond
...
...
@@ -82,7 +82,7 @@ leftJoin4' :: Select columnsA
->
Select
columnsB
->
Select
columnsC
->
Select
columnsD
->
((
columnsA
,
columnsB
,
columnsC
,
columnsD
)
->
Column
SqlBool
)
->
((
columnsA
,
columnsB
,
columnsC
,
columnsD
)
->
Column
SqlBool
)
->
Select
(
columnsA
,
columnsB
,
columnsC
,
columnsD
)
leftJoin4'
q1
q2
q3
q4
cond
=
((,,,)
<$>
q1
<*>
q2
<*>
q3
<*>
q4
)
>>>
keepWhen
cond
...
...
@@ -375,4 +375,3 @@ leftJoin9 q1 q2 q3 q4 q5 q6 q7 q8 q9
)
cond67
)
cond78
)
cond89
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment