Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
147
Issues
147
List
Board
Labels
Milestones
Merge Requests
6
Merge Requests
6
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
94a16978
Commit
94a16978
authored
Jul 09, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[CLI COOC] fix cooc behavior, next refacto and newtypes.
parent
7fe6bf9e
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
82 additions
and
48 deletions
+82
-48
Main.hs
bin/gargantext-cli/Main.hs
+65
-32
package.yaml
package.yaml
+2
-0
Count.hs
src/Gargantext/Text/Metrics/Count.hs
+7
-4
Terms.hs
src/Gargantext/Text/Terms.hs
+2
-4
WithList.hs
src/Gargantext/Text/Terms/WithList.hs
+6
-8
No files found.
bin/gargantext-cli/Main.hs
View file @
94a16978
...
...
@@ -22,17 +22,26 @@ Main specifications to index a corpus with a term list
module
Main
where
import
Data.ByteString.Lazy
(
writeFile
)
import
Data.Maybe
(
catMaybes
)
import
Data.Text
(
pack
)
import
qualified
Data.Text
as
DT
import
Data.Tuple.Extra
(
both
)
import
qualified
Data.Vector
as
DV
import
qualified
Data.Maybe
as
DMaybe
import
Control.Monad
(
zipWithM
)
import
Control.Monad.IO.Class
import
qualified
Data.IntMap
as
DM
import
Data.Map
(
Map
)
import
qualified
Data.IntMap
as
DIM
import
qualified
Data.Map
as
DM
import
GHC.Generics
import
Data.Aeson
import
Data.Text
(
Text
)
import
Data.List
(
cycle
,
concat
,
unwords
)
import
Data.List.Split
(
chunksOf
)
...
...
@@ -51,44 +60,42 @@ import Gargantext.Text.Terms.WithList
import
Gargantext.Text.Parsers.CSV
(
readCsv
,
csv_title
,
csv_abstract
,
csv_publication_year
)
import
Gargantext.Text.List.CSV
(
csvGraphTermList
)
import
Gargantext.Text.Terms
(
terms
)
import
Gargantext.Text.Metrics.Count
(
coocOn
,
Coocs
)
import
Gargantext.Text.Metrics.Count
(
coocOn
Contexts
,
Coocs
)
mapMP
::
MonadIO
m
=>
(
a
->
m
b
)
->
[
a
]
->
m
[
b
]
mapMP
f
xs
=
do
bs
<-
zipWithM
g
(
cycle
"-
\\
|/"
)
xs
liftIO
$
hPutStr
stderr
"
\r
Done
\n
"
pure
bs
where
g
c
x
=
do
liftIO
$
hPutStr
stderr
[
'
\r
'
,
c
]
liftIO
$
hFlush
stderr
f
x
------------------------------------------------------------------------
-- OUTPUT format
mapConcurrentlyChunked
::
(
a
->
IO
b
)
->
[
a
]
->
IO
[
b
]
mapConcurrentlyChunked
f
ts
=
do
caps
<-
getNumCapabilities
let
n
=
1
`
max
`
(
length
ts
`
div
`
caps
)
concat
<$>
mapConcurrently
(
mapM
f
)
(
chunksOf
n
ts
)
data
CoocByYear
=
CoocByYear
{
year
::
Int
,
coocurrences
::
Map
(
Text
,
Text
)
Coocs
}
deriving
(
Show
,
Generic
)
data
CoocByYears
=
CoocByYears
{
years
::
[
CoocByYear
]
}
deriving
(
Show
,
Generic
)
instance
ToJSON
CoocByYear
instance
ToJSON
CoocByYears
------------------------------------------------------------------------
filterTermsAndCooc
::
TermType
Lang
::
Patterns
->
(
Int
,
[
Text
])
->
IO
(
Map
(
Terms
,
Terms
)
Coocs
)
->
IO
CoocByYear
-- (Int, (Map (Text, Text) Coocs)
)
filterTermsAndCooc
patterns
(
year
,
ts
)
=
do
log
"start"
r
<-
coocOn
identity
<$>
mapM
(
\
x
->
{-log "work" >>-}
terms
patterns
x
)
ts
r
<-
coocOn
Contexts
identity
<$>
mapM
(
\
x
->
{-log "work" >>-}
terms'
patterns
x
)
ts
log
"stop"
pure
r
pure
$
CoocByYear
year
(
DM
.
mapKeys
(
both
DT
.
unwords
)
r
)
where
log
m
=
do
tid
<-
myThreadId
(
p
,
_
)
<-
threadCapability
tid
putStrLn
.
unwords
$
[
"filterTermsAndCooc:"
,
m
,
show
year
,
"on proc"
,
show
p
]
--main :: IO [()]
main
::
IO
()
main
=
do
[
corpusFile
,
termListFile
,
_
]
<-
getArgs
[
corpusFile
,
termListFile
,
outputFile
]
<-
getArgs
--corpus :: IO (DM.IntMap [[Text]])
corpus
<-
DM
.
fromListWith
(
<>
)
...
...
@@ -102,18 +109,44 @@ main = do
putStrLn
$
show
$
length
termList
let
patterns
=
WithList
$
buildPatterns
termList
let
patterns
=
buildPatterns
termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r
<-
mapConcurrently
(
filterTermsAndCooc
patterns
)
(
DM
.
toList
corpus
)
putStrLn
$
show
r
--writeFile outputFile cooc
r
<-
mapConcurrently
(
filterTermsAndCooc
patterns
)
(
DM
.
toList
corpus
)
writeFile
outputFile
$
encode
(
CoocByYears
r
)
------------------------------------------------------------------------
-- | Tools
mapMP
::
MonadIO
m
=>
(
a
->
m
b
)
->
[
a
]
->
m
[
b
]
mapMP
f
xs
=
do
bs
<-
zipWithM
g
(
cycle
"-
\\
|/"
)
xs
liftIO
$
hPutStr
stderr
"
\r
Done
\n
"
pure
bs
where
g
c
x
=
do
liftIO
$
hPutStr
stderr
[
'
\r
'
,
c
]
liftIO
$
hFlush
stderr
f
x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked
::
(
a
->
IO
b
)
->
[
a
]
->
IO
[
b
]
mapConcurrentlyChunked
f
ts
=
do
caps
<-
getNumCapabilities
let
n
=
1
`
max
`
(
length
ts
`
div
`
caps
)
concat
<$>
mapConcurrently
(
mapM
f
)
(
chunksOf
n
ts
)
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms'
pats
txt
=
pure
$
concat
$
extractTermsWithList
pats
txt
testCooc
=
do
let
patterns
=
WithList
$
buildPatterns
testTermList
mapM
(
\
x
->
{-log "work" >>-}
terms
patterns
x
)
$
catMaybes
$
map
(
head
.
snd
)
testCorpus
--mapConcurrently (filterTermsAndCooc patterns) testCorpus
-- | TODO Minimal Example
--testCooc = do
-- let patterns = buildPatterns testTermList
-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus
::
[(
Int
,
[
Text
])]
...
...
package.yaml
View file @
94a16978
...
...
@@ -160,8 +160,10 @@ executables:
-
-O2
-
-Wmissing-signatures
dependencies
:
-
aeson
-
async
-
base
-
bytestring
-
containers
-
gargantext
-
vector
...
...
src/Gargantext/Text/Metrics/Count.hs
View file @
94a16978
...
...
@@ -73,10 +73,10 @@ type Occs = Int
type
Coocs
=
Int
type
Threshold
=
Int
removeApax
::
Threshold
->
Map
(
Label
,
Label
)
Int
->
Map
(
Label
,
Label
)
Int
removeApax
::
Threshold
->
Map
(
[
Text
],
[
Text
])
Int
->
Map
([
Text
],
[
Text
]
)
Int
removeApax
t
=
DMS
.
filter
(
>
t
)
cooc
::
[[
Terms
]]
->
Map
(
Label
,
Label
)
Int
cooc
::
[[
Terms
]]
->
Map
(
[
Text
],
[
Text
]
)
Int
cooc
tss
=
coocOnWithLabel
_terms_stem
(
useLabelPolicy
label_policy
)
tss
where
terms_occs
=
occurrencesOn
_terms_stem
(
List
.
concat
tss
)
...
...
@@ -91,12 +91,12 @@ coocOnWithLabel on' policy tss = mapKeys (delta policy) $ coocOn on' tss
delta
f
=
f
***
f
mkLabelPolicy
::
Map
Grouped
(
Map
Terms
Occs
)
->
Map
Grouped
Label
mkLabelPolicy
::
Map
Grouped
(
Map
Terms
Occs
)
->
Map
Grouped
[
Text
]
mkLabelPolicy
=
DMS
.
map
f
where
f
=
_terms_label
.
fst
.
maximumWith
snd
.
DMS
.
toList
-- TODO use the Foldable instance of Map instead of building a list
useLabelPolicy
::
Map
Grouped
Label
->
Grouped
->
Label
useLabelPolicy
::
Map
Grouped
[
Text
]
->
Grouped
->
[
Text
]
useLabelPolicy
m
g
=
case
DMS
.
lookup
g
m
of
Just
label
->
label
Nothing
->
panic
$
"Label of Grouped not found: "
<>
(
pack
$
show
g
)
...
...
@@ -120,6 +120,8 @@ coocOn' fun ts = DMS.fromListWith (+) xs
,
x
>=
y
]
------------------------------------------------------------------------
coocOnContexts
::
(
a
->
[
Text
])
->
[[
a
]]
->
Map
([
Text
],
[
Text
])
Int
coocOnContexts
fun
=
DMS
.
fromListWith
(
+
)
.
List
.
concat
.
map
(
coocOnSingleContext
fun
)
...
...
@@ -132,6 +134,7 @@ coocOnSingleContext fun ts = xs
,
y
<-
ts'
,
x
>=
y
]
------------------------------------------------------------------------
-- | Compute the grouped occurrences (occ)
...
...
src/Gargantext/Text/Terms.hs
View file @
94a16978
...
...
@@ -33,7 +33,6 @@ compute graph
module
Gargantext.Text.Terms
where
import
Data.List
(
concat
)
import
Data.Text
(
Text
)
import
Data.Traversable
...
...
@@ -42,10 +41,9 @@ import Gargantext.Core
import
Gargantext.Core.Types
import
Gargantext.Text.Terms.Multi
(
multiterms
)
import
Gargantext.Text.Terms.Mono
(
monoTerms
)
import
Gargantext.Text.Terms.WithList
(
Patterns
,
extractTermsWithList
)
data
TermType
lang
=
Mono
lang
|
Multi
lang
|
MonoMulti
lang
|
WithList
Patterns
data
TermType
lang
=
Mono
lang
|
Multi
lang
|
MonoMulti
lang
group
::
[
Text
]
->
[
Text
]
...
...
@@ -68,6 +66,6 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms
(
Mono
lang
)
txt
=
pure
$
monoTerms
lang
txt
terms
(
Multi
lang
)
txt
=
multiterms
lang
txt
terms
(
MonoMulti
lang
)
txt
=
terms
(
Multi
lang
)
txt
terms
(
WithList
list
)
txt
=
pure
.
concat
$
extractTermsWithList
list
txt
--
terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
src/Gargantext/Text/Terms/WithList.hs
View file @
94a16978
...
...
@@ -21,7 +21,6 @@ import qualified Data.Algorithms.KMP as KMP
import
Data.Text
(
Text
)
import
qualified
Data.IntMap.Strict
as
IntMap
import
Gargantext.Core.Types
(
Terms
(
..
))
import
Gargantext.Text.Context
import
Gargantext.Text.Terms.Mono
(
monoTextsBySentence
)
...
...
@@ -29,21 +28,20 @@ import Prelude (error)
import
Gargantext.Prelude
import
Data.List
(
null
,
concatMap
)
import
Data.Ord
import
qualified
Data.Set
as
Set
------------------------------------------------------------------------
data
Pattern
=
Pattern
{
_pat_table
::
!
(
KMP
.
Table
Te
rm
)
{
_pat_table
::
!
(
KMP
.
Table
Te
xt
)
,
_pat_length
::
!
Int
,
_pat_terms
::
!
Terms
,
_pat_terms
::
!
[
Text
]
}
type
Patterns
=
[
Pattern
]
------------------------------------------------------------------------
replaceTerms
::
Patterns
->
Sentence
Term
->
Sentence
Terms
replaceTerms
::
Patterns
->
[
Text
]
->
[[
Text
]]
replaceTerms
pats
terms
=
go
0
where
terms_len
=
length
terms
...
...
@@ -72,8 +70,8 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
f
alt
|
""
`
elem
`
alt
=
error
"buildPatterns: ERR1"
|
null
alt
=
error
"buildPatterns: ERR2"
|
otherwise
=
Pattern
(
KMP
.
build
alt
)
(
length
alt
)
(
Terms
label
$
Set
.
empty
)
-- TODO check stems
Pattern
(
KMP
.
build
alt
)
(
length
alt
)
label
--
(Terms label $ Set.empty) -- TODO check stems
extractTermsWithList
::
Patterns
->
Text
->
Corpus
Terms
extractTermsWithList
::
Patterns
->
Text
->
Corpus
[
Text
]
extractTermsWithList
pats
=
map
(
replaceTerms
pats
)
.
monoTextsBySentence
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment