Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
158
Issues
158
List
Board
Labels
Milestones
Merge Requests
11
Merge Requests
11
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
94a16978
Commit
94a16978
authored
Jul 09, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[CLI COOC] fix cooc behavior, next refacto and newtypes.
parent
7fe6bf9e
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
82 additions
and
48 deletions
+82
-48
Main.hs
bin/gargantext-cli/Main.hs
+65
-32
package.yaml
package.yaml
+2
-0
Count.hs
src/Gargantext/Text/Metrics/Count.hs
+7
-4
Terms.hs
src/Gargantext/Text/Terms.hs
+2
-4
WithList.hs
src/Gargantext/Text/Terms/WithList.hs
+6
-8
No files found.
bin/gargantext-cli/Main.hs
View file @
94a16978
...
@@ -22,17 +22,26 @@ Main specifications to index a corpus with a term list
...
@@ -22,17 +22,26 @@ Main specifications to index a corpus with a term list
module
Main
where
module
Main
where
import
Data.ByteString.Lazy
(
writeFile
)
import
Data.Maybe
(
catMaybes
)
import
Data.Maybe
(
catMaybes
)
import
Data.Text
(
pack
)
import
Data.Text
(
pack
)
import
qualified
Data.Text
as
DT
import
Data.Tuple.Extra
(
both
)
import
qualified
Data.Vector
as
DV
import
qualified
Data.Vector
as
DV
import
qualified
Data.Maybe
as
DMaybe
import
qualified
Data.Maybe
as
DMaybe
import
Control.Monad
(
zipWithM
)
import
Control.Monad
(
zipWithM
)
import
Control.Monad.IO.Class
import
Control.Monad.IO.Class
import
qualified
Data.IntMap
as
DM
import
Data.Map
(
Map
)
import
Data.Map
(
Map
)
import
qualified
Data.IntMap
as
DIM
import
qualified
Data.Map
as
DM
import
GHC.Generics
import
Data.Aeson
import
Data.Text
(
Text
)
import
Data.Text
(
Text
)
import
Data.List
(
cycle
,
concat
,
unwords
)
import
Data.List
(
cycle
,
concat
,
unwords
)
import
Data.List.Split
(
chunksOf
)
import
Data.List.Split
(
chunksOf
)
...
@@ -51,44 +60,42 @@ import Gargantext.Text.Terms.WithList
...
@@ -51,44 +60,42 @@ import Gargantext.Text.Terms.WithList
import
Gargantext.Text.Parsers.CSV
(
readCsv
,
csv_title
,
csv_abstract
,
csv_publication_year
)
import
Gargantext.Text.Parsers.CSV
(
readCsv
,
csv_title
,
csv_abstract
,
csv_publication_year
)
import
Gargantext.Text.List.CSV
(
csvGraphTermList
)
import
Gargantext.Text.List.CSV
(
csvGraphTermList
)
import
Gargantext.Text.Terms
(
terms
)
import
Gargantext.Text.Terms
(
terms
)
import
Gargantext.Text.Metrics.Count
(
coocOn
,
Coocs
)
import
Gargantext.Text.Metrics.Count
(
coocOn
Contexts
,
Coocs
)
mapMP
::
MonadIO
m
=>
(
a
->
m
b
)
->
[
a
]
->
m
[
b
]
------------------------------------------------------------------------
mapMP
f
xs
=
do
-- OUTPUT format
bs
<-
zipWithM
g
(
cycle
"-
\\
|/"
)
xs
liftIO
$
hPutStr
stderr
"
\r
Done
\n
"
pure
bs
where
g
c
x
=
do
liftIO
$
hPutStr
stderr
[
'
\r
'
,
c
]
liftIO
$
hFlush
stderr
f
x
mapConcurrentlyChunked
::
(
a
->
IO
b
)
->
[
a
]
->
IO
[
b
]
data
CoocByYear
=
CoocByYear
{
year
::
Int
mapConcurrentlyChunked
f
ts
=
do
,
coocurrences
::
Map
(
Text
,
Text
)
Coocs
caps
<-
getNumCapabilities
}
deriving
(
Show
,
Generic
)
let
n
=
1
`
max
`
(
length
ts
`
div
`
caps
)
concat
<$>
mapConcurrently
(
mapM
f
)
(
chunksOf
n
ts
)
data
CoocByYears
=
CoocByYears
{
years
::
[
CoocByYear
]
}
deriving
(
Show
,
Generic
)
instance
ToJSON
CoocByYear
instance
ToJSON
CoocByYears
------------------------------------------------------------------------
filterTermsAndCooc
filterTermsAndCooc
::
TermType
Lang
::
Patterns
->
(
Int
,
[
Text
])
->
(
Int
,
[
Text
])
->
IO
(
Map
(
Terms
,
Terms
)
Coocs
)
->
IO
CoocByYear
-- (Int, (Map (Text, Text) Coocs)
)
filterTermsAndCooc
patterns
(
year
,
ts
)
=
do
filterTermsAndCooc
patterns
(
year
,
ts
)
=
do
log
"start"
log
"start"
r
<-
coocOn
identity
<$>
mapM
(
\
x
->
{-log "work" >>-}
terms
patterns
x
)
ts
r
<-
coocOn
Contexts
identity
<$>
mapM
(
\
x
->
{-log "work" >>-}
terms'
patterns
x
)
ts
log
"stop"
log
"stop"
pure
r
pure
$
CoocByYear
year
(
DM
.
mapKeys
(
both
DT
.
unwords
)
r
)
where
where
log
m
=
do
log
m
=
do
tid
<-
myThreadId
tid
<-
myThreadId
(
p
,
_
)
<-
threadCapability
tid
(
p
,
_
)
<-
threadCapability
tid
putStrLn
.
unwords
$
putStrLn
.
unwords
$
[
"filterTermsAndCooc:"
,
m
,
show
year
,
"on proc"
,
show
p
]
[
"filterTermsAndCooc:"
,
m
,
show
year
,
"on proc"
,
show
p
]
--main :: IO [()]
main
::
IO
()
main
=
do
main
=
do
[
corpusFile
,
termListFile
,
_
]
<-
getArgs
[
corpusFile
,
termListFile
,
outputFile
]
<-
getArgs
--corpus :: IO (DM.IntMap [[Text]])
--corpus :: IO (DM.IntMap [[Text]])
corpus
<-
DM
.
fromListWith
(
<>
)
corpus
<-
DM
.
fromListWith
(
<>
)
...
@@ -102,18 +109,44 @@ main = do
...
@@ -102,18 +109,44 @@ main = do
putStrLn
$
show
$
length
termList
putStrLn
$
show
$
length
termList
let
patterns
=
WithList
$
buildPatterns
termList
let
patterns
=
buildPatterns
termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r
<-
mapConcurrently
(
filterTermsAndCooc
patterns
)
(
DM
.
toList
corpus
)
r
<-
mapConcurrently
(
filterTermsAndCooc
patterns
)
(
DM
.
toList
corpus
)
putStrLn
$
show
r
writeFile
outputFile
$
encode
(
CoocByYears
r
)
--writeFile outputFile cooc
------------------------------------------------------------------------
-- | Tools
mapMP
::
MonadIO
m
=>
(
a
->
m
b
)
->
[
a
]
->
m
[
b
]
mapMP
f
xs
=
do
bs
<-
zipWithM
g
(
cycle
"-
\\
|/"
)
xs
liftIO
$
hPutStr
stderr
"
\r
Done
\n
"
pure
bs
where
g
c
x
=
do
liftIO
$
hPutStr
stderr
[
'
\r
'
,
c
]
liftIO
$
hFlush
stderr
f
x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked
::
(
a
->
IO
b
)
->
[
a
]
->
IO
[
b
]
mapConcurrentlyChunked
f
ts
=
do
caps
<-
getNumCapabilities
let
n
=
1
`
max
`
(
length
ts
`
div
`
caps
)
concat
<$>
mapConcurrently
(
mapM
f
)
(
chunksOf
n
ts
)
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms'
pats
txt
=
pure
$
concat
$
extractTermsWithList
pats
txt
testCooc
=
do
-- | TODO Minimal Example
let
patterns
=
WithList
$
buildPatterns
testTermList
--testCooc = do
mapM
(
\
x
->
{-log "work" >>-}
terms
patterns
x
)
$
catMaybes
$
map
(
head
.
snd
)
testCorpus
-- let patterns = buildPatterns testTermList
--mapConcurrently (filterTermsAndCooc patterns) testCorpus
-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus
::
[(
Int
,
[
Text
])]
testCorpus
::
[(
Int
,
[
Text
])]
...
...
package.yaml
View file @
94a16978
...
@@ -160,8 +160,10 @@ executables:
...
@@ -160,8 +160,10 @@ executables:
-
-O2
-
-O2
-
-Wmissing-signatures
-
-Wmissing-signatures
dependencies
:
dependencies
:
-
aeson
-
async
-
async
-
base
-
base
-
bytestring
-
containers
-
containers
-
gargantext
-
gargantext
-
vector
-
vector
...
...
src/Gargantext/Text/Metrics/Count.hs
View file @
94a16978
...
@@ -73,10 +73,10 @@ type Occs = Int
...
@@ -73,10 +73,10 @@ type Occs = Int
type
Coocs
=
Int
type
Coocs
=
Int
type
Threshold
=
Int
type
Threshold
=
Int
removeApax
::
Threshold
->
Map
(
Label
,
Label
)
Int
->
Map
(
Label
,
Label
)
Int
removeApax
::
Threshold
->
Map
(
[
Text
],
[
Text
])
Int
->
Map
([
Text
],
[
Text
]
)
Int
removeApax
t
=
DMS
.
filter
(
>
t
)
removeApax
t
=
DMS
.
filter
(
>
t
)
cooc
::
[[
Terms
]]
->
Map
(
Label
,
Label
)
Int
cooc
::
[[
Terms
]]
->
Map
(
[
Text
],
[
Text
]
)
Int
cooc
tss
=
coocOnWithLabel
_terms_stem
(
useLabelPolicy
label_policy
)
tss
cooc
tss
=
coocOnWithLabel
_terms_stem
(
useLabelPolicy
label_policy
)
tss
where
where
terms_occs
=
occurrencesOn
_terms_stem
(
List
.
concat
tss
)
terms_occs
=
occurrencesOn
_terms_stem
(
List
.
concat
tss
)
...
@@ -91,12 +91,12 @@ coocOnWithLabel on' policy tss = mapKeys (delta policy) $ coocOn on' tss
...
@@ -91,12 +91,12 @@ coocOnWithLabel on' policy tss = mapKeys (delta policy) $ coocOn on' tss
delta
f
=
f
***
f
delta
f
=
f
***
f
mkLabelPolicy
::
Map
Grouped
(
Map
Terms
Occs
)
->
Map
Grouped
Label
mkLabelPolicy
::
Map
Grouped
(
Map
Terms
Occs
)
->
Map
Grouped
[
Text
]
mkLabelPolicy
=
DMS
.
map
f
where
mkLabelPolicy
=
DMS
.
map
f
where
f
=
_terms_label
.
fst
.
maximumWith
snd
.
DMS
.
toList
f
=
_terms_label
.
fst
.
maximumWith
snd
.
DMS
.
toList
-- TODO use the Foldable instance of Map instead of building a list
-- TODO use the Foldable instance of Map instead of building a list
useLabelPolicy
::
Map
Grouped
Label
->
Grouped
->
Label
useLabelPolicy
::
Map
Grouped
[
Text
]
->
Grouped
->
[
Text
]
useLabelPolicy
m
g
=
case
DMS
.
lookup
g
m
of
useLabelPolicy
m
g
=
case
DMS
.
lookup
g
m
of
Just
label
->
label
Just
label
->
label
Nothing
->
panic
$
"Label of Grouped not found: "
<>
(
pack
$
show
g
)
Nothing
->
panic
$
"Label of Grouped not found: "
<>
(
pack
$
show
g
)
...
@@ -120,6 +120,8 @@ coocOn' fun ts = DMS.fromListWith (+) xs
...
@@ -120,6 +120,8 @@ coocOn' fun ts = DMS.fromListWith (+) xs
,
x
>=
y
,
x
>=
y
]
]
------------------------------------------------------------------------
coocOnContexts
::
(
a
->
[
Text
])
->
[[
a
]]
->
Map
([
Text
],
[
Text
])
Int
coocOnContexts
::
(
a
->
[
Text
])
->
[[
a
]]
->
Map
([
Text
],
[
Text
])
Int
coocOnContexts
fun
=
DMS
.
fromListWith
(
+
)
.
List
.
concat
.
map
(
coocOnSingleContext
fun
)
coocOnContexts
fun
=
DMS
.
fromListWith
(
+
)
.
List
.
concat
.
map
(
coocOnSingleContext
fun
)
...
@@ -132,6 +134,7 @@ coocOnSingleContext fun ts = xs
...
@@ -132,6 +134,7 @@ coocOnSingleContext fun ts = xs
,
y
<-
ts'
,
y
<-
ts'
,
x
>=
y
,
x
>=
y
]
]
------------------------------------------------------------------------
-- | Compute the grouped occurrences (occ)
-- | Compute the grouped occurrences (occ)
...
...
src/Gargantext/Text/Terms.hs
View file @
94a16978
...
@@ -33,7 +33,6 @@ compute graph
...
@@ -33,7 +33,6 @@ compute graph
module
Gargantext.Text.Terms
module
Gargantext.Text.Terms
where
where
import
Data.List
(
concat
)
import
Data.Text
(
Text
)
import
Data.Text
(
Text
)
import
Data.Traversable
import
Data.Traversable
...
@@ -42,10 +41,9 @@ import Gargantext.Core
...
@@ -42,10 +41,9 @@ import Gargantext.Core
import
Gargantext.Core.Types
import
Gargantext.Core.Types
import
Gargantext.Text.Terms.Multi
(
multiterms
)
import
Gargantext.Text.Terms.Multi
(
multiterms
)
import
Gargantext.Text.Terms.Mono
(
monoTerms
)
import
Gargantext.Text.Terms.Mono
(
monoTerms
)
import
Gargantext.Text.Terms.WithList
(
Patterns
,
extractTermsWithList
)
data
TermType
lang
=
Mono
lang
|
Multi
lang
|
MonoMulti
lang
|
WithList
Patterns
data
TermType
lang
=
Mono
lang
|
Multi
lang
|
MonoMulti
lang
group
::
[
Text
]
->
[
Text
]
group
::
[
Text
]
->
[
Text
]
...
@@ -68,6 +66,6 @@ terms :: TermType Lang -> Text -> IO [Terms]
...
@@ -68,6 +66,6 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms
(
Mono
lang
)
txt
=
pure
$
monoTerms
lang
txt
terms
(
Mono
lang
)
txt
=
pure
$
monoTerms
lang
txt
terms
(
Multi
lang
)
txt
=
multiterms
lang
txt
terms
(
Multi
lang
)
txt
=
multiterms
lang
txt
terms
(
MonoMulti
lang
)
txt
=
terms
(
Multi
lang
)
txt
terms
(
MonoMulti
lang
)
txt
=
terms
(
Multi
lang
)
txt
terms
(
WithList
list
)
txt
=
pure
.
concat
$
extractTermsWithList
list
txt
--
terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
------------------------------------------------------------------------
src/Gargantext/Text/Terms/WithList.hs
View file @
94a16978
...
@@ -21,7 +21,6 @@ import qualified Data.Algorithms.KMP as KMP
...
@@ -21,7 +21,6 @@ import qualified Data.Algorithms.KMP as KMP
import
Data.Text
(
Text
)
import
Data.Text
(
Text
)
import
qualified
Data.IntMap.Strict
as
IntMap
import
qualified
Data.IntMap.Strict
as
IntMap
import
Gargantext.Core.Types
(
Terms
(
..
))
import
Gargantext.Text.Context
import
Gargantext.Text.Context
import
Gargantext.Text.Terms.Mono
(
monoTextsBySentence
)
import
Gargantext.Text.Terms.Mono
(
monoTextsBySentence
)
...
@@ -29,21 +28,20 @@ import Prelude (error)
...
@@ -29,21 +28,20 @@ import Prelude (error)
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Data.List
(
null
,
concatMap
)
import
Data.List
(
null
,
concatMap
)
import
Data.Ord
import
Data.Ord
import
qualified
Data.Set
as
Set
------------------------------------------------------------------------
------------------------------------------------------------------------
data
Pattern
=
Pattern
data
Pattern
=
Pattern
{
_pat_table
::
!
(
KMP
.
Table
Te
rm
)
{
_pat_table
::
!
(
KMP
.
Table
Te
xt
)
,
_pat_length
::
!
Int
,
_pat_length
::
!
Int
,
_pat_terms
::
!
Terms
,
_pat_terms
::
!
[
Text
]
}
}
type
Patterns
=
[
Pattern
]
type
Patterns
=
[
Pattern
]
------------------------------------------------------------------------
------------------------------------------------------------------------
replaceTerms
::
Patterns
->
Sentence
Term
->
Sentence
Terms
replaceTerms
::
Patterns
->
[
Text
]
->
[[
Text
]]
replaceTerms
pats
terms
=
go
0
replaceTerms
pats
terms
=
go
0
where
where
terms_len
=
length
terms
terms_len
=
length
terms
...
@@ -72,8 +70,8 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
...
@@ -72,8 +70,8 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
f
alt
|
""
`
elem
`
alt
=
error
"buildPatterns: ERR1"
f
alt
|
""
`
elem
`
alt
=
error
"buildPatterns: ERR1"
|
null
alt
=
error
"buildPatterns: ERR2"
|
null
alt
=
error
"buildPatterns: ERR2"
|
otherwise
=
|
otherwise
=
Pattern
(
KMP
.
build
alt
)
(
length
alt
)
Pattern
(
KMP
.
build
alt
)
(
length
alt
)
label
(
Terms
label
$
Set
.
empty
)
-- TODO check stems
--
(Terms label $ Set.empty) -- TODO check stems
extractTermsWithList
::
Patterns
->
Text
->
Corpus
Terms
extractTermsWithList
::
Patterns
->
Text
->
Corpus
[
Text
]
extractTermsWithList
pats
=
map
(
replaceTerms
pats
)
.
monoTextsBySentence
extractTermsWithList
pats
=
map
(
replaceTerms
pats
)
.
monoTextsBySentence
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment