Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
195
Issues
195
List
Board
Labels
Milestones
Merge Requests
12
Merge Requests
12
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
b0b6a491
Commit
b0b6a491
authored
Nov 07, 2022
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[ngrams] implemented doc_count column
parent
092d482b
Pipeline
#3368
passed with stage
in 92 minutes and 7 seconds
Changes
6
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
75 additions
and
27 deletions
+75
-27
Search.hs
src/Gargantext/API/Search.hs
+3
-3
Terms.hs
src/Gargantext/Core/Text/Terms.hs
+4
-11
Flow.hs
src/Gargantext/Database/Action/Flow.hs
+1
-3
Search.hs
src/Gargantext/Database/Action/Search.hs
+62
-4
Facet.hs
src/Gargantext/Database/Query/Facet.hs
+2
-2
Join.hs
src/Gargantext/Database/Query/Join.hs
+3
-4
No files found.
src/Gargantext/API/Search.hs
View file @
b0b6a491
...
...
@@ -70,7 +70,7 @@ api nId (SearchQuery q SearchContact) o l order = do
<$>
map
(
toRow
aId
)
<$>
searchInCorpusWithContacts
nId
aId
(
concat
q
)
o
l
order
api
nId
(
SearchQuery
q
SearchDocWithNgrams
)
o
l
order
=
undefined
api
_nId
(
SearchQuery
_q
SearchDocWithNgrams
)
_o
_l
_
order
=
undefined
-----------------------------------------------------------------------
-----------------------------------------------------------------------
...
...
@@ -135,7 +135,7 @@ instance FromJSON SearchResultTypes where
parseJSON
=
genericParseJSON
(
defaultOptions
{
sumEncoding
=
defaultTaggedObject
})
instance
ToJSON
SearchResultTypes
where
toJSON
=
genericToJSON
(
defaultOptions
{
sumEncoding
=
defaultTaggedObject
})
instance
Arbitrary
SearchResultTypes
where
arbitrary
=
do
srd
<-
SearchResultDoc
<$>
arbitrary
...
...
@@ -166,7 +166,7 @@ data Row =
deriving
(
Generic
)
instance
FromJSON
Row
where
parseJSON
=
genericParseJSON
parseJSON
=
genericParseJSON
(
defaultOptions
{
sumEncoding
=
defaultTaggedObject
}
)
instance
ToJSON
Row
where
...
...
src/Gargantext/Core/Text/Terms.hs
View file @
b0b6a491
...
...
@@ -171,17 +171,10 @@ isSimpleNgrams _ = False
-- 'MonoMulti' : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms
::
TermType
Lang
->
Text
->
IO
[
TermsWithCount
]
terms
tt
txt
=
do
printDebug
"[terms] tt"
tt
printDebug
"[terms] txt"
txt
out
<-
termsNoLog
tt
txt
printDebug
"[terms] out"
out
pure
out
termsNoLog
::
TermType
Lang
->
Text
->
IO
[
TermsWithCount
]
termsNoLog
(
Mono
lang
)
txt
=
pure
$
monoTerms
lang
txt
termsNoLog
(
Multi
lang
)
txt
=
multiterms
lang
txt
termsNoLog
(
MonoMulti
lang
)
txt
=
terms
(
Multi
lang
)
txt
termsNoLog
(
Unsupervised
{
..
})
txt
=
pure
$
termsUnsupervised
(
Unsupervised
{
_tt_model
=
Just
m'
,
..
})
txt
terms
(
Mono
lang
)
txt
=
pure
$
monoTerms
lang
txt
terms
(
Multi
lang
)
txt
=
multiterms
lang
txt
terms
(
MonoMulti
lang
)
txt
=
terms
(
Multi
lang
)
txt
terms
(
Unsupervised
{
..
})
txt
=
pure
$
termsUnsupervised
(
Unsupervised
{
_tt_model
=
Just
m'
,
..
})
txt
where
m'
=
maybe
(
newTries
_tt_ngramsSize
txt
)
identity
_tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
...
...
src/Gargantext/Database/Action/Flow.hs
View file @
b0b6a491
...
...
@@ -375,7 +375,7 @@ saveDocNgramsWith :: (FlowCmdM env err m)
->
HashMap
ExtractedNgrams
(
Map
NgramsType
(
Map
NodeId
(
Int
,
TermsCount
)))
->
m
()
saveDocNgramsWith
lId
mapNgramsDocs'
=
do
printDebug
"[saveDocNgramsWith] mapNgramsDocs'"
mapNgramsDocs'
--
printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
let
mapNgramsDocsNoCount
=
over
(
traverse
.
traverse
.
traverse
)
fst
mapNgramsDocs'
terms2id
<-
insertExtractedNgrams
$
HashMap
.
keys
mapNgramsDocsNoCount
...
...
@@ -524,8 +524,6 @@ instance ExtractNgramsT HyperdataDocument
termsWithCounts'
<-
map
(
\
(
t
,
cnt
)
->
(
enrichedTerms
(
lang'
^.
tt_lang
)
CoreNLP
NP
t
,
cnt
))
<$>
concat
<$>
liftBase
(
extractTerms
lang'
$
hasText
doc
)
printDebug
"[extractNgramsT HyperdataDocument] termsWithCounts'"
termsWithCounts'
printDebug
"[extractNgramsT HyperdataDocument] termsWithLargerCounts"
$
filter
(
\
(
_
,
cnt
)
->
cnt
>
1
)
termsWithCounts'
pure
$
HashMap
.
fromList
$
[(
SimpleNgrams
source
,
(
Map
.
singleton
Sources
1
,
1
))
]
...
...
src/Gargantext/Database/Action/Search.hs
View file @
b0b6a491
...
...
@@ -14,7 +14,10 @@ module Gargantext.Database.Action.Search where
import
Control.Arrow
(
returnA
)
import
Control.Lens
((
^.
))
import
qualified
Data.List
as
List
import
qualified
Data.Map
as
Map
import
Data.Maybe
import
qualified
Data.Set
as
Set
import
Data.Text
(
Text
,
unpack
,
intercalate
)
import
Data.Time
(
UTCTime
)
import
Gargantext.Core
...
...
@@ -26,9 +29,11 @@ import Gargantext.Database.Query.Filter
import
Gargantext.Database.Query.Join
(
leftJoin5
)
import
Gargantext.Database.Query.Table.Node
import
Gargantext.Database.Query.Table.Context
import
Gargantext.Database.Query.Table.ContextNodeNgrams
(
queryContextNodeNgramsTable
)
import
Gargantext.Database.Query.Table.NodeContext
import
Gargantext.Database.Schema.Ngrams
(
NgramsType
(
..
))
import
Gargantext.Database.Query.Table.NodeContext_NodeContext
import
Gargantext.Database.Schema.ContextNodeNgrams
(
ContextNodeNgramsPoly
(
..
))
import
Gargantext.Database.Schema.Ngrams
(
NgramsType
(
..
))
import
Gargantext.Database.Schema.Node
import
Gargantext.Database.Schema.Context
import
Gargantext.Prelude
...
...
@@ -44,7 +49,7 @@ searchDocInDatabase :: HasDBid NodeType
->
Cmd
err
[(
NodeId
,
HyperdataDocument
)]
searchDocInDatabase
p
t
=
runOpaQuery
(
queryDocInDatabase
p
t
)
where
-- | Global search query where ParentId is Master Node Corpus Id
-- | Global search query where ParentId is Master Node Corpus Id
queryDocInDatabase
::
ParentId
->
Text
->
O
.
Select
(
Column
SqlInt4
,
Column
SqlJsonb
)
queryDocInDatabase
_p
q
=
proc
()
->
do
row
<-
queryNodeSearchTable
-<
()
...
...
@@ -54,6 +59,8 @@ searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
------------------------------------------------------------------------
-- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
-- search only to map/candidate terms.
searchInCorpusWithNgrams
::
HasDBid
NodeType
=>
CorpusId
->
ListId
...
...
@@ -64,7 +71,59 @@ searchInCorpusWithNgrams :: HasDBid NodeType
->
Maybe
Limit
->
Maybe
OrderBy
->
Cmd
err
[
FacetDoc
]
searchInCorpusWithNgrams
cId
lId
t
ngt
q
o
l
order
=
undefined
searchInCorpusWithNgrams
_cId
_lId
_t
_ngt
_q
_o
_l
_order
=
undefined
-- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
-- case only the "TF" part makes sense and so we only compute the
-- ratio of "number of times our terms appear in given document" and
-- "number of all terms in document" and return a sorted list of
-- document ids
tfidfAll
::
CorpusId
->
[
Int
]
->
Cmd
err
[
Int
]
tfidfAll
cId
ngramIds
=
do
let
ngramIdsSet
=
Set
.
fromList
ngramIds
docsWithNgrams
<-
runOpaQuery
(
queryCorpusWithNgrams
cId
ngramIds
)
::
Cmd
err
[(
Int
,
Int
,
Int
)]
-- NOTE The query returned docs with ANY ngramIds. We need to further
-- restrict to ALL ngramIds.
let
docsNgramsM
=
Map
.
fromListWith
(
Set
.
union
)
[
(
ctxId
,
Set
.
singleton
ngrams_id
)
|
(
ctxId
,
ngrams_id
,
_
)
<-
docsWithNgrams
]
let
docsWithAllNgramsS
=
Set
.
fromList
$
List
.
map
fst
$
List
.
filter
(
\
(
_
,
docNgrams
)
->
ngramIdsSet
==
Set
.
intersection
ngramIdsSet
docNgrams
)
$
Map
.
toList
docsNgramsM
let
docsWithAllNgrams
=
List
.
filter
(
\
(
ctxId
,
_
,
_
)
->
Set
.
member
ctxId
docsWithAllNgramsS
)
docsWithNgrams
printDebug
"[tfidfAll] docsWithAllNgrams"
docsWithAllNgrams
let
docsWithCounts
=
Map
.
fromListWith
(
+
)
[
(
ctxId
,
doc_count
)
|
(
ctxId
,
_
,
doc_count
)
<-
docsWithAllNgrams
]
printDebug
"[tfidfAll] docsWithCounts"
docsWithCounts
let
totals
=
[
(
ctxId
,
ngrams_id
,
fromIntegral
doc_count
::
Double
,
fromIntegral
(
fromMaybe
0
$
Map
.
lookup
ctxId
docsWithCounts
)
::
Double
)
|
(
ctxId
,
ngrams_id
,
doc_count
)
<-
docsWithAllNgrams
]
let
tfidf_sorted
=
List
.
sortOn
snd
[(
ctxId
,
doc_count
/
s
)
|
(
ctxId
,
_
,
doc_count
,
s
)
<-
totals
]
pure
$
List
.
map
fst
$
List
.
reverse
tfidf_sorted
-- | Query for searching the 'context_node_ngrams' table so that we
-- find docs with ANY given 'ngramIds'.
queryCorpusWithNgrams
::
CorpusId
->
[
Int
]
->
Select
(
Column
SqlInt4
,
Column
SqlInt4
,
Column
SqlInt4
)
queryCorpusWithNgrams
cId
ngramIds
=
proc
()
->
do
row
<-
queryContextNodeNgramsTable
-<
()
restrict
-<
(
_cnng_node_id
row
)
.==
(
pgNodeId
cId
)
restrict
-<
in_
(
sqlInt4
<$>
ngramIds
)
(
_cnng_ngrams_id
row
)
returnA
-<
(
_cnng_context_id
row
,
_cnng_ngrams_id
row
,
_cnng_doc_count
row
)
--returnA -< row
-- returnA -< ( _cnng_context_id row
-- , _cnng_node_id row
-- , _cnng_ngrams_id row
-- , _cnng_ngramsType row
-- , _cnng_weight row
-- , _cnng_doc_count row)
------------------------------------------------------------------------
...
...
@@ -225,4 +284,3 @@ queryContactViaDoc =
)
)
->
Column
SqlBool
cond45
(
doc
,
(
corpus
,
(
_
,(
_
,
_
))))
=
doc
^.
cs_id
.==
corpus
^.
nc_context_id
src/Gargantext/Database/Query/Facet.hs
View file @
b0b6a491
...
...
@@ -103,7 +103,7 @@ data Facet id created title hyperdata category ngramCount score =
,
facetDoc_score
::
score
}
deriving
(
Show
,
Generic
)
{- | TODO after demo
data Facet id date hyperdata score =
data Facet id date hyperdata score =
FacetDoc { facetDoc_id :: id
, facetDoc_date :: date
, facetDoc_hyperdata :: hyperdata
...
...
@@ -163,7 +163,7 @@ type FacetPairedReadNull = FacetPaired (Column (Nullable SqlInt4) )
(
Column
(
Nullable
SqlInt4
)
)
type
FacetPairedReadNullAgg
=
FacetPaired
(
Aggregator
(
Column
(
Nullable
SqlInt4
)
)
(
Column
(
Nullable
SqlInt4
)
)
(
Column
(
Nullable
SqlInt4
)
)
)
(
Aggregator
(
Column
(
Nullable
SqlTimestamptz
))
(
Column
(
Nullable
SqlTimestamptz
))
...
...
src/Gargantext/Database/Query/Join.hs
View file @
b0b6a491
...
...
@@ -43,7 +43,7 @@ import Opaleye.Internal.Join (NullMaker(..))
import
qualified
Opaleye.Internal.Unpackspec
()
keepWhen
::
(
a
->
Field
SqlBool
)
->
SelectArr
a
a
keepWhen
::
(
a
->
Field
SqlBool
)
->
SelectArr
a
a
keepWhen
p
=
proc
a
->
do
restrict
-<
p
a
returnA
-<
a
...
...
@@ -61,7 +61,7 @@ leftJoin2 = leftJoin
------------------------------------------------------------------------
-- | LeftJoin3 in two ways to write it
leftJoin3
::
Select
columnsA
->
Select
columnsB
->
Select
columnsC
->
((
columnsA
,
columnsB
,
columnsC
)
->
Column
SqlBool
)
->
((
columnsA
,
columnsB
,
columnsC
)
->
Column
SqlBool
)
->
Select
(
columnsA
,
columnsB
,
columnsC
)
leftJoin3
q1
q2
q3
cond
=
((,,)
<$>
q1
<*>
q2
<*>
q3
)
>>>
keepWhen
cond
...
...
@@ -82,7 +82,7 @@ leftJoin4' :: Select columnsA
->
Select
columnsB
->
Select
columnsC
->
Select
columnsD
->
((
columnsA
,
columnsB
,
columnsC
,
columnsD
)
->
Column
SqlBool
)
->
((
columnsA
,
columnsB
,
columnsC
,
columnsD
)
->
Column
SqlBool
)
->
Select
(
columnsA
,
columnsB
,
columnsC
,
columnsD
)
leftJoin4'
q1
q2
q3
q4
cond
=
((,,,)
<$>
q1
<*>
q2
<*>
q3
<*>
q4
)
>>>
keepWhen
cond
...
...
@@ -375,4 +375,3 @@ leftJoin9 q1 q2 q3 q4 q5 q6 q7 q8 q9
)
cond67
)
cond78
)
cond89
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment