Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
haskell-gargantext
Commits
4bf4c2d6
Commit
4bf4c2d6
authored
Jun 27, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] New ngrams list + fix isidore api.
parent
7e77066f
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
104 additions
and
40 deletions
+104
-40
Tools.hs
src/Gargantext/API/Ngrams/Tools.hs
+9
-3
Metrics.hs
src/Gargantext/Database/Metrics.hs
+4
-4
List.hs
src/Gargantext/Text/List.hs
+74
-17
Metrics.hs
src/Gargantext/Text/Metrics.hs
+16
-15
IsidoreApi.hs
src/Gargantext/Text/Parsers/IsidoreApi.hs
+1
-1
No files found.
src/Gargantext/API/Ngrams/Tools.hs
View file @
4bf4c2d6
...
...
@@ -85,13 +85,19 @@ groupNodesByNgrams syn occs = Map.fromListWith (<>) occs'
data
Diagonal
=
Diagonal
Bool
getCoocByNgrams
::
Diagonal
->
Map
Text
(
Set
NodeId
)
->
Map
(
Text
,
Text
)
Int
getCoocByNgrams
(
Diagonal
diag
)
m
=
getCoocByNgrams
=
getCoocByNgrams'
identity
getCoocByNgrams'
::
(
Ord
a
,
Ord
c
)
=>
(
b
->
Set
c
)
->
Diagonal
->
Map
a
b
->
Map
(
a
,
a
)
Int
getCoocByNgrams'
f
(
Diagonal
diag
)
m
=
Map
.
fromList
[((
t1
,
t2
)
,
maybe
0
Set
.
size
$
Set
.
intersection
<$>
Map
.
lookup
t1
m
<*>
Map
.
lookup
t2
m
<$>
(
fmap
f
$
Map
.
lookup
t1
m
)
<*>
(
fmap
f
$
Map
.
lookup
t2
m
)
)
|
(
t1
,
t2
)
<-
case
diag
of
True
->
[
(
x
,
y
)
|
x
<-
Map
.
keys
m
,
y
<-
Map
.
keys
m
,
x
<=
y
]
False
->
listToCombi
identity
(
Map
.
keys
m
)
]
src/Gargantext/Database/Metrics.hs
View file @
4bf4c2d6
...
...
@@ -31,16 +31,16 @@ import Gargantext.Database.Types.Node (ListId, CorpusId{-, HyperdataCorpus-})
--import Gargantext.Database.Flow (getOrMkRootWithCorpus)
import
Gargantext.Database.Config
(
userMaster
)
import
Gargantext.Prelude
import
Gargantext.Text.Metrics
(
scored
,
Scored
(
..
),
localMetrics
{-
, toScored-}
)
import
Gargantext.Text.Metrics
(
scored
,
Scored
(
..
),
{-localMetrics
, toScored-}
)
import
qualified
Data.Map
as
Map
import
qualified
Data.Vector.Storable
as
Vec
--
import qualified Data.Vector.Storable as Vec
getMetrics
::
FlowCmdM
env
err
m
=>
CorpusId
->
Maybe
ListId
->
TabType
->
Maybe
Limit
->
m
(
Map
Text
(
ListType
,
Maybe
Text
),
[
Scored
Text
])
getMetrics
cId
maybeListId
tabType
maybeLimit
=
do
(
ngs
,
_
,
myCooc
)
<-
getNgramsCooc
cId
maybeListId
tabType
maybeLimit
(
ngs
,
_
,
myCooc
)
<-
getNgramsCooc
cId
maybeListId
tabType
maybeLimit
pure
(
ngs
,
scored
myCooc
)
...
...
@@ -59,7 +59,6 @@ getMetrics cId maybeListId tabType maybeLimit = do
metrics' <- getTficfWith cId masterCorpusId (lIds <> [lId]) (ngramsTypeFromTabType tabType) ngs'
pure (ngs , toScored [metrics, Map.fromList $ map (\(a,b) -> (a, Vec.fromList [fst b])) $ Map.toList metrics'])
-}
getLocalMetrics :: (FlowCmdM env err m)
=> CorpusId -> Maybe ListId -> TabType -> Maybe Limit
...
...
@@ -70,6 +69,7 @@ getLocalMetrics :: (FlowCmdM env err m)
getLocalMetrics cId maybeListId tabType maybeLimit = do
(ngs, ngs', myCooc) <- getNgramsCooc cId maybeListId tabType maybeLimit
pure (ngs, ngs', localMetrics myCooc)
-}
getNgramsCooc
::
(
FlowCmdM
env
err
m
)
...
...
src/Gargantext/Text/List.hs
View file @
4bf4c2d6
...
...
@@ -16,16 +16,20 @@ Portability : POSIX
module
Gargantext.Text.List
where
import
Data.Either
(
partitionEithers
,
Either
(
..
))
import
Debug.Trace
(
trace
)
import
Data.Map
(
Map
)
import
Data.Set
(
Set
)
import
Data.Text
(
Text
)
import
Gargantext.API.Ngrams
(
NgramsElement
,
mkNgramsElement
,
RootParent
(
..
),
mSetFromList
)
import
Gargantext.API.Ngrams.Tools
(
getCoocByNgrams'
,
Diagonal
(
..
))
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Types
(
ListType
(
..
),
MasterCorpusId
,
UserCorpusId
)
import
Gargantext.Core.Types
(
ListType
(
..
),
MasterCorpusId
,
UserCorpusId
,
NodeId
)
import
Gargantext.Database.Metrics.NgramsByNode
(
getTficf'
,
sortTficf
,
ngramsGroup
,
getNodesByNgramsUser
,
groupNodesByNgramsWith
)
import
Gargantext.Database.Schema.Ngrams
(
NgramsType
(
..
))
import
Gargantext.Database.Utils
(
Cmd
)
import
Gargantext.Text.List.Learn
(
Model
(
..
))
import
Gargantext.Text.Metrics
(
takeScored
)
import
Gargantext.Prelude
--import Gargantext.Text.Terms (TermType(..))
import
qualified
Data.Char
as
Char
...
...
@@ -41,6 +45,13 @@ data NgramsListBuilder = BuilderStepO { stemSize :: Int
}
|
BuilderStep1
{
withModel
::
Model
}
|
BuilderStepN
{
withModel
::
Model
}
|
Tficf
{
nlb_lang
::
Lang
,
nlb_group1
::
Int
,
nlb_group2
::
Int
,
nlb_stopSize
::
StopSize
,
nlb_userCorpusId
::
UserCorpusId
,
nlb_masterCorpusId
::
MasterCorpusId
}
data
StopSize
=
StopSize
{
unStopSize
::
Int
}
...
...
@@ -50,6 +61,7 @@ buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorp
->
Cmd
err
(
Map
NgramsType
[
NgramsElement
])
buildNgramsLists
l
n
m
s
uCid
mCid
=
do
ngTerms
<-
buildNgramsTermsList
l
n
m
s
uCid
mCid
--ngTerms <- buildNgramsTermsList' uCid (ngramsGroup l n m) (isStopTerm s . fst) 550 300
othersTerms
<-
mapM
(
buildNgramsOthersList
uCid
identity
)
[
Authors
,
Sources
,
Institutes
]
pure
$
Map
.
unions
$
othersTerms
<>
[
ngTerms
]
...
...
@@ -69,11 +81,54 @@ buildNgramsOthersList uCid groupIt nt = do
)
]
--{-
buildNgramsTermsList'
::
UserCorpusId
->
(
Text
->
Text
)
->
((
Text
,
(
Set
Text
,
Set
NodeId
))
->
Bool
)
->
Int
->
Int
->
Cmd
err
(
Map
NgramsType
[
NgramsElement
])
--}
buildNgramsTermsList'
uCid
groupIt
stop
gls
is
=
do
ngs
<-
groupNodesByNgramsWith
groupIt
<$>
getNodesByNgramsUser
uCid
NgramsTerms
let
(
stops
,
candidates
)
=
partitionEithers
$
map
(
\
t
->
if
stop
t
then
Left
t
else
Right
t
)
$
Map
.
toList
$
Map
.
filter
((
\
s'
->
Set
.
size
s'
>
1
)
.
snd
)
ngs
(
maps
,
candidates'
)
=
takeScored
gls
is
$
getCoocByNgrams'
snd
(
Diagonal
True
)
$
Map
.
fromList
candidates
toList'
t
=
(
fst
t
,
(
fromIntegral
$
Set
.
size
$
snd
$
snd
t
,
fst
$
snd
t
))
(
s
,
c
,
m
)
=
(
stops
,
List
.
filter
(
\
(
k
,
_
)
->
List
.
elem
k
candidates'
)
candidates
,
List
.
filter
(
\
(
k
,
_
)
->
List
.
elem
k
maps
)
candidates
)
let
ngs'
=
List
.
concat
$
map
toNgramsElement
$
map
(
\
t
->
(
StopTerm
,
toList'
t
))
s
<>
map
(
\
t
->
(
CandidateTerm
,
toList'
t
))
c
<>
map
(
\
t
->
(
GraphTerm
,
toList'
t
))
m
pure
$
Map
.
fromList
[(
NgramsTerms
,
ngs'
)]
buildNgramsTermsList
::
Lang
->
Int
->
Int
->
StopSize
->
UserCorpusId
->
MasterCorpusId
->
Cmd
err
(
Map
NgramsType
[
NgramsElement
])
buildNgramsTermsList
l
n
m
s
uCid
mCid
=
do
candidates
<-
sortTficf
<$>
getTficf'
uCid
mCid
NgramsTerms
(
ngramsGroup
l
n
m
)
let
termList
=
toTermList
((
isStopTerm
s
)
.
fst
)
candidates
let
candidatesSize
=
2000
a
=
500
b
=
500
candidatesHead
=
List
.
take
candidatesSize
candidates
candidatesTail
=
List
.
drop
candidatesSize
candidates
termList
=
(
toTermList
a
b
((
isStopTerm
s
)
.
fst
)
candidatesHead
)
<>
(
map
(
toList
((
isStopTerm
s
)
.
fst
)
CandidateTerm
)
candidatesTail
)
let
ngs
=
List
.
concat
$
map
toNgramsElement
termList
pure
$
Map
.
fromList
[(
NgramsTerms
,
ngs
)]
...
...
@@ -94,24 +149,26 @@ toNgramsElement (listType, (_stem, (_score, setNgrams))) =
(
mSetFromList
[]
)
)
children
-- TODO remove hard coded parameters
toTermList
::
(
a
->
Bool
)
->
[
a
]
->
[(
ListType
,
a
)]
toTermList
stop
ns
=
map
(
toTermList'
stop
CandidateTerm
)
xs
<>
map
(
toTermList'
stop
GraphTerm
)
ys
<>
map
(
toTermList'
stop
CandidateTerm
)
zs
where
toTermList'
stop'
l
n
=
case
stop'
n
of
True
->
(
StopTerm
,
n
)
False
->
(
l
,
n
)
-- TODO use % of size of list
-- TODO user ML
toList
::
(
b
->
Bool
)
->
ListType
->
b
->
(
ListType
,
b
)
toList
stop
l
n
=
case
stop
n
of
True
->
(
StopTerm
,
n
)
False
->
(
l
,
n
)
toTermList
::
Int
->
Int
->
(
a
->
Bool
)
->
[
a
]
->
[(
ListType
,
a
)]
toTermList
_
_
_
[]
=
[]
toTermList
a
b
stop
ns
=
trace
(
"computing toTermList"
)
$
map
(
toList
stop
CandidateTerm
)
xs
<>
map
(
toList
stop
GraphTerm
)
ys
<>
toTermList
a
b
stop
zs
where
xs
=
take
a
ns
ys
=
take
b
$
drop
a
ns
zs
=
drop
b
$
drop
a
ns
ta
=
drop
a
ns
ys
=
take
b
ta
zs
=
drop
b
ta
a
=
300
b
=
350
isStopTerm
::
StopSize
->
Text
->
Bool
isStopTerm
(
StopSize
n
)
x
=
Text
.
length
x
<
n
||
any
isStopChar
(
Text
.
unpack
x
)
...
...
src/Gargantext/Text/Metrics.hs
View file @
4bf4c2d6
...
...
@@ -22,7 +22,7 @@ module Gargantext.Text.Metrics
--import Math.KMeans (kmeans, euclidSq, elements)
--import GHC.Float (exp)
import
Data.Tuple.Extra
(
both
)
import
Data.Map
(
Map
)
import
Data.List.Extra
(
sortOn
)
import
GHC.Real
(
round
)
...
...
@@ -40,21 +40,22 @@ import qualified Data.Vector.Storable as Vec
type
GraphListSize
=
Int
type
InclusionSize
=
Int
toScored
::
Ord
t
=>
[
Map
t
(
Vec
.
Vector
Double
)]
->
[
Scored
t
]
toScored
=
map2scored
{-
toScored' :: Ord t => [Map t (Vec.Vector Double)] -> [Scored t]
toScored' = map2scored
. (pcaReduceTo (Dimension 2))
. (Map.filter (\v -> Vec.length v > 1))
. (Map.unionsWith (<>))
-}
scored
::
Ord
t
=>
Map
(
t
,
t
)
Int
->
[
Scored
t
]
scored
=
map2scored
.
(
pcaReduceTo
(
Dimension
2
))
.
scored2map
where
scored2map
::
Ord
t
=>
Map
(
t
,
t
)
Int
->
Map
t
(
Vec
.
Vector
Double
)
scored2map
m
=
Map
.
fromList
$
map
(
\
(
Scored
t
i
s
)
->
(
t
,
Vec
.
fromList
[
i
,
s
]))
$
scored'
m
scored2map
::
Ord
t
=>
Map
(
t
,
t
)
Int
->
Map
t
(
Vec
.
Vector
Double
)
scored2map
m
=
Map
.
fromList
$
map
(
\
(
Scored
t
i
s
)
->
(
t
,
Vec
.
fromList
[
i
,
s
]))
$
scored'
m
map2scored
::
Ord
t
=>
Map
t
(
Vec
.
Vector
Double
)
->
[
Scored
t
]
map2scored
=
map
(
\
(
t
,
ds
)
->
Scored
t
(
Vec
.
head
ds
)
(
Vec
.
last
ds
))
.
Map
.
toList
map2scored
::
Ord
t
=>
Map
t
(
Vec
.
Vector
Double
)
->
[
Scored
t
]
map2scored
=
map
(
\
(
t
,
ds
)
->
Scored
t
(
Vec
.
head
ds
)
(
Vec
.
last
ds
))
.
Map
.
toList
-- TODO change type with (x,y)
data
Scored
ts
=
Scored
...
...
@@ -63,8 +64,8 @@ data Scored ts = Scored
,
_scored_speGen
::
!
SpecificityGenericity
}
deriving
(
Show
)
localMetrics
::
Ord
t
=>
Map
(
t
,
t
)
Int
->
Map
t
(
Vec
.
Vector
Double
)
localMetrics
m
=
Map
.
fromList
$
zipWith
(
\
(
_
,
t
)
(
inc
,
spe
)
->
(
t
,
Vec
.
fromList
[
inc
,
spe
]))
localMetrics
'
::
Ord
t
=>
Map
(
t
,
t
)
Int
->
Map
t
(
Vec
.
Vector
Double
)
localMetrics
'
m
=
Map
.
fromList
$
zipWith
(
\
(
_
,
t
)
(
inc
,
spe
)
->
(
t
,
Vec
.
fromList
[
inc
,
spe
]))
(
Map
.
toList
fi
)
scores
where
...
...
@@ -88,8 +89,8 @@ scored' m = zipWith (\(_,t) (inc,spe) -> Scored t (inc) (spe)) (Map.toList fi) s
$
DAA
.
zip
(
DAA
.
use
is
)
(
DAA
.
use
ss
)
takeScored
::
Ord
t
=>
GraphListSize
->
InclusionSize
->
Map
(
t
,
t
)
Int
->
[
t
]
takeScored
listSize
incSize
=
map
_scored_terms
takeScored
::
Ord
t
=>
GraphListSize
->
InclusionSize
->
Map
(
t
,
t
)
Int
->
([
t
],[
t
])
takeScored
listSize
incSize
=
both
(
map
_scored_terms
)
.
linearTakes
listSize
incSize
_scored_speGen
_scored_incExc
.
scored
...
...
@@ -100,8 +101,8 @@ takeScored listSize incSize = map _scored_terms
-- [(3,8),(6,5)]
linearTakes
::
(
Ord
b1
,
Ord
b2
)
=>
GraphListSize
->
InclusionSize
->
(
a
->
b2
)
->
(
a
->
b1
)
->
[
a
]
->
[
a
]
linearTakes
gls
incSize
speGen
incExc
=
take
gls
->
(
a
->
b2
)
->
(
a
->
b1
)
->
[
a
]
->
([
a
],[
a
])
linearTakes
gls
incSize
speGen
incExc
=
(
List
.
splitAt
gls
)
.
List
.
concat
.
map
(
take
$
round
$
(
fromIntegral
gls
::
Double
)
...
...
src/Gargantext/Text/Parsers/IsidoreApi.hs
View file @
4bf4c2d6
...
...
@@ -78,8 +78,8 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
Nothing
Nothing
(
Just
$
cleanText
$
langText
t
)
Nothing
(
creator2text
<$>
as
)
Nothing
(
_sourceName
<$>
s
)
(
cleanText
<$>
langText
<$>
a
)
(
fmap
(
Text
.
pack
.
show
)
utcTime
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment