Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
147
Issues
147
List
Board
Labels
Milestones
Merge Requests
10
Merge Requests
10
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
4bf4c2d6
Commit
4bf4c2d6
authored
Jun 27, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] New ngrams list + fix isidore api.
parent
7e77066f
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
104 additions
and
40 deletions
+104
-40
Tools.hs
src/Gargantext/API/Ngrams/Tools.hs
+9
-3
Metrics.hs
src/Gargantext/Database/Metrics.hs
+4
-4
List.hs
src/Gargantext/Text/List.hs
+74
-17
Metrics.hs
src/Gargantext/Text/Metrics.hs
+16
-15
IsidoreApi.hs
src/Gargantext/Text/Parsers/IsidoreApi.hs
+1
-1
No files found.
src/Gargantext/API/Ngrams/Tools.hs
View file @
4bf4c2d6
...
...
@@ -85,13 +85,19 @@ groupNodesByNgrams syn occs = Map.fromListWith (<>) occs'
data
Diagonal
=
Diagonal
Bool
getCoocByNgrams
::
Diagonal
->
Map
Text
(
Set
NodeId
)
->
Map
(
Text
,
Text
)
Int
getCoocByNgrams
(
Diagonal
diag
)
m
=
getCoocByNgrams
=
getCoocByNgrams'
identity
getCoocByNgrams'
::
(
Ord
a
,
Ord
c
)
=>
(
b
->
Set
c
)
->
Diagonal
->
Map
a
b
->
Map
(
a
,
a
)
Int
getCoocByNgrams'
f
(
Diagonal
diag
)
m
=
Map
.
fromList
[((
t1
,
t2
)
,
maybe
0
Set
.
size
$
Set
.
intersection
<$>
Map
.
lookup
t1
m
<*>
Map
.
lookup
t2
m
<$>
(
fmap
f
$
Map
.
lookup
t1
m
)
<*>
(
fmap
f
$
Map
.
lookup
t2
m
)
)
|
(
t1
,
t2
)
<-
case
diag
of
True
->
[
(
x
,
y
)
|
x
<-
Map
.
keys
m
,
y
<-
Map
.
keys
m
,
x
<=
y
]
False
->
listToCombi
identity
(
Map
.
keys
m
)
]
src/Gargantext/Database/Metrics.hs
View file @
4bf4c2d6
...
...
@@ -31,9 +31,9 @@ import Gargantext.Database.Types.Node (ListId, CorpusId{-, HyperdataCorpus-})
--import Gargantext.Database.Flow (getOrMkRootWithCorpus)
import
Gargantext.Database.Config
(
userMaster
)
import
Gargantext.Prelude
import
Gargantext.Text.Metrics
(
scored
,
Scored
(
..
),
localMetrics
{-
, toScored-}
)
import
Gargantext.Text.Metrics
(
scored
,
Scored
(
..
),
{-localMetrics
, toScored-}
)
import
qualified
Data.Map
as
Map
import
qualified
Data.Vector.Storable
as
Vec
--
import qualified Data.Vector.Storable as Vec
getMetrics
::
FlowCmdM
env
err
m
...
...
@@ -59,7 +59,6 @@ getMetrics cId maybeListId tabType maybeLimit = do
metrics' <- getTficfWith cId masterCorpusId (lIds <> [lId]) (ngramsTypeFromTabType tabType) ngs'
pure (ngs , toScored [metrics, Map.fromList $ map (\(a,b) -> (a, Vec.fromList [fst b])) $ Map.toList metrics'])
-}
getLocalMetrics :: (FlowCmdM env err m)
=> CorpusId -> Maybe ListId -> TabType -> Maybe Limit
...
...
@@ -70,6 +69,7 @@ getLocalMetrics :: (FlowCmdM env err m)
getLocalMetrics cId maybeListId tabType maybeLimit = do
(ngs, ngs', myCooc) <- getNgramsCooc cId maybeListId tabType maybeLimit
pure (ngs, ngs', localMetrics myCooc)
-}
getNgramsCooc
::
(
FlowCmdM
env
err
m
)
...
...
src/Gargantext/Text/List.hs
View file @
4bf4c2d6
...
...
@@ -16,16 +16,20 @@ Portability : POSIX
module
Gargantext.Text.List
where
import
Data.Either
(
partitionEithers
,
Either
(
..
))
import
Debug.Trace
(
trace
)
import
Data.Map
(
Map
)
import
Data.Set
(
Set
)
import
Data.Text
(
Text
)
import
Gargantext.API.Ngrams
(
NgramsElement
,
mkNgramsElement
,
RootParent
(
..
),
mSetFromList
)
import
Gargantext.API.Ngrams.Tools
(
getCoocByNgrams'
,
Diagonal
(
..
))
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Types
(
ListType
(
..
),
MasterCorpusId
,
UserCorpusId
)
import
Gargantext.Core.Types
(
ListType
(
..
),
MasterCorpusId
,
UserCorpusId
,
NodeId
)
import
Gargantext.Database.Metrics.NgramsByNode
(
getTficf'
,
sortTficf
,
ngramsGroup
,
getNodesByNgramsUser
,
groupNodesByNgramsWith
)
import
Gargantext.Database.Schema.Ngrams
(
NgramsType
(
..
))
import
Gargantext.Database.Utils
(
Cmd
)
import
Gargantext.Text.List.Learn
(
Model
(
..
))
import
Gargantext.Text.Metrics
(
takeScored
)
import
Gargantext.Prelude
--import Gargantext.Text.Terms (TermType(..))
import
qualified
Data.Char
as
Char
...
...
@@ -41,6 +45,13 @@ data NgramsListBuilder = BuilderStepO { stemSize :: Int
}
|
BuilderStep1
{
withModel
::
Model
}
|
BuilderStepN
{
withModel
::
Model
}
|
Tficf
{
nlb_lang
::
Lang
,
nlb_group1
::
Int
,
nlb_group2
::
Int
,
nlb_stopSize
::
StopSize
,
nlb_userCorpusId
::
UserCorpusId
,
nlb_masterCorpusId
::
MasterCorpusId
}
data
StopSize
=
StopSize
{
unStopSize
::
Int
}
...
...
@@ -50,6 +61,7 @@ buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorp
->
Cmd
err
(
Map
NgramsType
[
NgramsElement
])
buildNgramsLists
l
n
m
s
uCid
mCid
=
do
ngTerms
<-
buildNgramsTermsList
l
n
m
s
uCid
mCid
--ngTerms <- buildNgramsTermsList' uCid (ngramsGroup l n m) (isStopTerm s . fst) 550 300
othersTerms
<-
mapM
(
buildNgramsOthersList
uCid
identity
)
[
Authors
,
Sources
,
Institutes
]
pure
$
Map
.
unions
$
othersTerms
<>
[
ngTerms
]
...
...
@@ -69,11 +81,54 @@ buildNgramsOthersList uCid groupIt nt = do
)
]
--{-
buildNgramsTermsList'
::
UserCorpusId
->
(
Text
->
Text
)
->
((
Text
,
(
Set
Text
,
Set
NodeId
))
->
Bool
)
->
Int
->
Int
->
Cmd
err
(
Map
NgramsType
[
NgramsElement
])
--}
buildNgramsTermsList'
uCid
groupIt
stop
gls
is
=
do
ngs
<-
groupNodesByNgramsWith
groupIt
<$>
getNodesByNgramsUser
uCid
NgramsTerms
let
(
stops
,
candidates
)
=
partitionEithers
$
map
(
\
t
->
if
stop
t
then
Left
t
else
Right
t
)
$
Map
.
toList
$
Map
.
filter
((
\
s'
->
Set
.
size
s'
>
1
)
.
snd
)
ngs
(
maps
,
candidates'
)
=
takeScored
gls
is
$
getCoocByNgrams'
snd
(
Diagonal
True
)
$
Map
.
fromList
candidates
toList'
t
=
(
fst
t
,
(
fromIntegral
$
Set
.
size
$
snd
$
snd
t
,
fst
$
snd
t
))
(
s
,
c
,
m
)
=
(
stops
,
List
.
filter
(
\
(
k
,
_
)
->
List
.
elem
k
candidates'
)
candidates
,
List
.
filter
(
\
(
k
,
_
)
->
List
.
elem
k
maps
)
candidates
)
let
ngs'
=
List
.
concat
$
map
toNgramsElement
$
map
(
\
t
->
(
StopTerm
,
toList'
t
))
s
<>
map
(
\
t
->
(
CandidateTerm
,
toList'
t
))
c
<>
map
(
\
t
->
(
GraphTerm
,
toList'
t
))
m
pure
$
Map
.
fromList
[(
NgramsTerms
,
ngs'
)]
buildNgramsTermsList
::
Lang
->
Int
->
Int
->
StopSize
->
UserCorpusId
->
MasterCorpusId
->
Cmd
err
(
Map
NgramsType
[
NgramsElement
])
buildNgramsTermsList
l
n
m
s
uCid
mCid
=
do
candidates
<-
sortTficf
<$>
getTficf'
uCid
mCid
NgramsTerms
(
ngramsGroup
l
n
m
)
let
termList
=
toTermList
((
isStopTerm
s
)
.
fst
)
candidates
let
candidatesSize
=
2000
a
=
500
b
=
500
candidatesHead
=
List
.
take
candidatesSize
candidates
candidatesTail
=
List
.
drop
candidatesSize
candidates
termList
=
(
toTermList
a
b
((
isStopTerm
s
)
.
fst
)
candidatesHead
)
<>
(
map
(
toList
((
isStopTerm
s
)
.
fst
)
CandidateTerm
)
candidatesTail
)
let
ngs
=
List
.
concat
$
map
toNgramsElement
termList
pure
$
Map
.
fromList
[(
NgramsTerms
,
ngs
)]
...
...
@@ -94,24 +149,26 @@ toNgramsElement (listType, (_stem, (_score, setNgrams))) =
(
mSetFromList
[]
)
)
children
-- TODO remove hard coded parameters
toTermList
::
(
a
->
Bool
)
->
[
a
]
->
[(
ListType
,
a
)]
toTermList
stop
ns
=
map
(
toTermList'
stop
CandidateTerm
)
xs
<>
map
(
toTermList'
stop
GraphTerm
)
ys
<>
map
(
toTermList'
stop
CandidateTerm
)
zs
where
toTermList'
stop'
l
n
=
case
stop'
n
of
toList
::
(
b
->
Bool
)
->
ListType
->
b
->
(
ListType
,
b
)
toList
stop
l
n
=
case
stop
n
of
True
->
(
StopTerm
,
n
)
False
->
(
l
,
n
)
-- TODO use % of size of list
-- TODO user ML
toTermList
::
Int
->
Int
->
(
a
->
Bool
)
->
[
a
]
->
[(
ListType
,
a
)]
toTermList
_
_
_
[]
=
[]
toTermList
a
b
stop
ns
=
trace
(
"computing toTermList"
)
$
map
(
toList
stop
CandidateTerm
)
xs
<>
map
(
toList
stop
GraphTerm
)
ys
<>
toTermList
a
b
stop
zs
where
xs
=
take
a
ns
ys
=
take
b
$
drop
a
ns
zs
=
drop
b
$
drop
a
ns
ta
=
drop
a
ns
ys
=
take
b
ta
zs
=
drop
b
ta
a
=
300
b
=
350
isStopTerm
::
StopSize
->
Text
->
Bool
isStopTerm
(
StopSize
n
)
x
=
Text
.
length
x
<
n
||
any
isStopChar
(
Text
.
unpack
x
)
...
...
src/Gargantext/Text/Metrics.hs
View file @
4bf4c2d6
...
...
@@ -22,7 +22,7 @@ module Gargantext.Text.Metrics
--import Math.KMeans (kmeans, euclidSq, elements)
--import GHC.Float (exp)
import
Data.Tuple.Extra
(
both
)
import
Data.Map
(
Map
)
import
Data.List.Extra
(
sortOn
)
import
GHC.Real
(
round
)
...
...
@@ -40,21 +40,22 @@ import qualified Data.Vector.Storable as Vec
type
GraphListSize
=
Int
type
InclusionSize
=
Int
toScored
::
Ord
t
=>
[
Map
t
(
Vec
.
Vector
Double
)]
->
[
Scored
t
]
toScored
=
map2scored
{-
toScored' :: Ord t => [Map t (Vec.Vector Double)] -> [Scored t]
toScored' = map2scored
. (pcaReduceTo (Dimension 2))
. (Map.filter (\v -> Vec.length v > 1))
. (Map.unionsWith (<>))
-}
scored
::
Ord
t
=>
Map
(
t
,
t
)
Int
->
[
Scored
t
]
scored
=
map2scored
.
(
pcaReduceTo
(
Dimension
2
))
.
scored2map
where
scored2map
::
Ord
t
=>
Map
(
t
,
t
)
Int
->
Map
t
(
Vec
.
Vector
Double
)
scored2map
m
=
Map
.
fromList
$
map
(
\
(
Scored
t
i
s
)
->
(
t
,
Vec
.
fromList
[
i
,
s
]))
$
scored'
m
scored2map
::
Ord
t
=>
Map
(
t
,
t
)
Int
->
Map
t
(
Vec
.
Vector
Double
)
scored2map
m
=
Map
.
fromList
$
map
(
\
(
Scored
t
i
s
)
->
(
t
,
Vec
.
fromList
[
i
,
s
]))
$
scored'
m
map2scored
::
Ord
t
=>
Map
t
(
Vec
.
Vector
Double
)
->
[
Scored
t
]
map2scored
=
map
(
\
(
t
,
ds
)
->
Scored
t
(
Vec
.
head
ds
)
(
Vec
.
last
ds
))
.
Map
.
toList
map2scored
::
Ord
t
=>
Map
t
(
Vec
.
Vector
Double
)
->
[
Scored
t
]
map2scored
=
map
(
\
(
t
,
ds
)
->
Scored
t
(
Vec
.
head
ds
)
(
Vec
.
last
ds
))
.
Map
.
toList
-- TODO change type with (x,y)
data
Scored
ts
=
Scored
...
...
@@ -63,8 +64,8 @@ data Scored ts = Scored
,
_scored_speGen
::
!
SpecificityGenericity
}
deriving
(
Show
)
localMetrics
::
Ord
t
=>
Map
(
t
,
t
)
Int
->
Map
t
(
Vec
.
Vector
Double
)
localMetrics
m
=
Map
.
fromList
$
zipWith
(
\
(
_
,
t
)
(
inc
,
spe
)
->
(
t
,
Vec
.
fromList
[
inc
,
spe
]))
localMetrics
'
::
Ord
t
=>
Map
(
t
,
t
)
Int
->
Map
t
(
Vec
.
Vector
Double
)
localMetrics
'
m
=
Map
.
fromList
$
zipWith
(
\
(
_
,
t
)
(
inc
,
spe
)
->
(
t
,
Vec
.
fromList
[
inc
,
spe
]))
(
Map
.
toList
fi
)
scores
where
...
...
@@ -88,8 +89,8 @@ scored' m = zipWith (\(_,t) (inc,spe) -> Scored t (inc) (spe)) (Map.toList fi) s
$
DAA
.
zip
(
DAA
.
use
is
)
(
DAA
.
use
ss
)
takeScored
::
Ord
t
=>
GraphListSize
->
InclusionSize
->
Map
(
t
,
t
)
Int
->
[
t
]
takeScored
listSize
incSize
=
map
_scored_terms
takeScored
::
Ord
t
=>
GraphListSize
->
InclusionSize
->
Map
(
t
,
t
)
Int
->
([
t
],[
t
])
takeScored
listSize
incSize
=
both
(
map
_scored_terms
)
.
linearTakes
listSize
incSize
_scored_speGen
_scored_incExc
.
scored
...
...
@@ -100,8 +101,8 @@ takeScored listSize incSize = map _scored_terms
-- [(3,8),(6,5)]
linearTakes
::
(
Ord
b1
,
Ord
b2
)
=>
GraphListSize
->
InclusionSize
->
(
a
->
b2
)
->
(
a
->
b1
)
->
[
a
]
->
[
a
]
linearTakes
gls
incSize
speGen
incExc
=
take
gls
->
(
a
->
b2
)
->
(
a
->
b1
)
->
[
a
]
->
([
a
],[
a
])
linearTakes
gls
incSize
speGen
incExc
=
(
List
.
splitAt
gls
)
.
List
.
concat
.
map
(
take
$
round
$
(
fromIntegral
gls
::
Double
)
...
...
src/Gargantext/Text/Parsers/IsidoreApi.hs
View file @
4bf4c2d6
...
...
@@ -78,8 +78,8 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
Nothing
Nothing
(
Just
$
cleanText
$
langText
t
)
Nothing
(
creator2text
<$>
as
)
Nothing
(
_sourceName
<$>
s
)
(
cleanText
<$>
langText
<$>
a
)
(
fmap
(
Text
.
pack
.
show
)
utcTime
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment