Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Grégoire Locqueville
haskell-gargantext
Commits
f789b2e4
Commit
f789b2e4
authored
Jun 24, 2024
by
Alfredo Di Napoli
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
CLI: Add the filter-terms command
parent
0087a0f8
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
210 additions
and
120 deletions
+210
-120
CleanCsvCorpus.hs
bin/gargantext-cli/CLI/CleanCsvCorpus.hs
+1
-1
FilterTermsAndCooc.hs
bin/gargantext-cli/CLI/FilterTermsAndCooc.hs
+125
-0
Types.hs
bin/gargantext-cli/CLI/Types.hs
+25
-0
Utils.hs
bin/gargantext-cli/CLI/Utils.hs
+32
-0
Main.hs
bin/gargantext-cli/Main.hs
+22
-118
gargantext.cabal
gargantext.cabal
+5
-1
No files found.
bin/gargantext-cli/CleanCsvCorpus.hs
→
bin/gargantext-cli/C
LI/C
leanCsvCorpus.hs
View file @
f789b2e4
...
...
@@ -12,7 +12,7 @@ compress the contexts around the main terms of the query.
-}
module
CleanCsvCorpus
where
module
C
LI.C
leanCsvCorpus
where
import
Data.SearchEngine
qualified
as
S
import
Data.Set
qualified
as
S
...
...
bin/gargantext-cli/CLI/FilterTermsAndCooc.hs
0 → 100644
View file @
f789b2e4
module
CLI.FilterTermsAndCooc
(
filterTermsAndCoocCmd
,
filterTermsAndCoocCLI
-- * Testing functions
,
testCorpus
,
testTermList
)
where
import
CLI.Types
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Aeson
(
encode
)
import
Data.Map.Strict
qualified
as
DM
import
Data.Text
(
pack
)
import
Data.Text
qualified
as
DT
import
Data.Text.Lazy
qualified
as
DTL
import
Data.Text.Lazy.Encoding
qualified
as
TLE
import
Data.Tuple.Extra
(
both
)
import
Data.Vector
qualified
as
DV
import
GHC.Generics
import
Gargantext.Core.Text.Context
(
TermList
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
readTSVFile
,
tsv_title
,
tsv_abstract
,
tsv_publication_year
,
fromMIntOrDec
,
defaultYear
)
import
Gargantext.Core.Text.List.Formats.TSV
(
tsvMapTermList
)
import
Gargantext.Core.Text.Metrics.Count
(
coocOnContexts
,
Coocs
)
import
Gargantext.Core.Text.Terms.WithList
(
Patterns
,
buildPatterns
,
extractTermsWithList
)
import
Gargantext.Prelude
import
Options.Applicative
------------------------------------------------------------------------
-- OUTPUT format
data
CoocByYear
=
CoocByYear
{
year
::
Int
,
nbContexts
::
NbContexts
,
coocurrences
::
Map
(
Text
,
Text
)
Coocs
}
deriving
(
Show
,
Generic
)
data
CoocByYears
=
CoocByYears
{
years
::
[
CoocByYear
]
}
deriving
(
Show
,
Generic
)
type
NbContexts
=
Int
instance
ToJSON
CoocByYear
instance
ToJSON
CoocByYears
------------------------------------------------------------------------
filterTermsAndCoocCLI
::
CorpusFile
->
TermListFile
->
OutputFile
->
IO
()
filterTermsAndCoocCLI
(
CorpusFile
corpusFile
)
(
TermListFile
termListFile
)
(
OutputFile
outputFile
)
=
do
--corpus :: IO (DM.IntMap [[Text]])
eCorpusFile
<-
readTSVFile
corpusFile
case
eCorpusFile
of
Right
cf
->
do
let
corpus
=
DM
.
fromListWith
(
<>
)
.
DV
.
toList
.
DV
.
map
(
\
n
->
(
fromMIntOrDec
defaultYear
$
tsv_publication_year
n
,
[(
tsv_title
n
)
<>
" "
<>
(
tsv_abstract
n
)]))
.
snd
$
cf
-- termListMap :: [Text]
termList
<-
tsvMapTermList
termListFile
putText
$
show
$
length
termList
let
patterns
=
buildPatterns
termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r
<-
mapConcurrently
(
filterTermsAndCooc
patterns
)
(
DM
.
toList
corpus
)
writeFile
outputFile
$
DTL
.
toStrict
$
TLE
.
decodeUtf8
$
encode
(
CoocByYears
r
)
Left
e
->
panicTrace
$
"Error: "
<>
e
filterTermsAndCooc
::
Patterns
->
(
Int
,
[
Text
])
->
IO
CoocByYear
-- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc
patterns
(
year
,
ts
)
=
do
logWork
"start"
r
<-
coocOnContexts
identity
<$>
mapM
(
\
x
->
{-log "work" >>-}
terms'
patterns
x
)
ts
logWork
"stop"
pure
$
CoocByYear
year
(
length
ts
)
(
DM
.
mapKeys
(
both
DT
.
unwords
)
r
)
where
logWork
m
=
do
tid
<-
myThreadId
(
p
,
_
)
<-
threadCapability
tid
putText
.
unwords
$
[
"filterTermsAndCooc:"
,
m
,
show
year
,
"on proc"
,
show
p
]
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms'
::
Applicative
f
=>
Patterns
->
Text
->
f
[[
Text
]]
terms'
pats
txt
=
pure
$
concat
$
extractTermsWithList
pats
txt
-- | TODO Minimal Example
--testCooc = do
-- let patterns = buildPatterns testTermList
-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus
::
[(
Int
,
[
Text
])]
testCorpus
=
[
(
1998
,
[
pack
"The beees"
])
,
(
1999
,
[
pack
"The bees and the flowers"
--, pack "The bees and the flowers"
])
]
testTermList
::
TermList
testTermList
=
[
([
pack
"bee"
],
[[
pack
"bees"
]])
,
([
pack
"flower"
],
[[
pack
"flowers"
]])
]
--
-- CLI API
--
filterTermsAndCoocCmd
::
HasCallStack
=>
Mod
CommandFields
CLI
filterTermsAndCoocCmd
=
command
"filter-terms"
(
info
(
helper
<*>
fmap
CLISub
filterTerms
)
(
progDesc
"Filter Terms and Cooc."
))
filterTerms
::
Parser
CLICmd
filterTerms
=
CCMD_filter_terms_and_cooc
<$>
(
option
str
(
long
"corpus-file"
<>
metavar
"FILE"
))
<*>
(
option
str
(
long
"terms-list-file"
<>
metavar
"FILE"
))
<*>
(
option
str
(
long
"output-file"
<>
metavar
"FILE"
))
bin/gargantext-cli/CLI/Types.hs
0 → 100644
View file @
f789b2e4
module
CLI.Types
where
import
Prelude
import
Data.String
newtype
CorpusFile
=
CorpusFile
{
_CorpusFile
::
FilePath
}
deriving
(
Show
,
Eq
,
IsString
)
newtype
TermListFile
=
TermListFile
{
_TermsListFile
::
FilePath
}
deriving
(
Show
,
Eq
,
IsString
)
newtype
OutputFile
=
OutputFile
{
_OutputFile
::
FilePath
}
deriving
(
Show
,
Eq
,
IsString
)
data
CLICmd
=
CCMD_clean_csv_corpus
|
CCMD_filter_terms_and_cooc
!
CorpusFile
!
TermListFile
!
OutputFile
deriving
(
Show
,
Eq
)
data
CLI
=
CLISub
CLICmd
deriving
(
Show
,
Eq
)
bin/gargantext-cli/CLI/Utils.hs
0 → 100644
View file @
f789b2e4
module
CLI.Utils
(
mapMP
,
mapConcurrentlyChunked
)
where
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.List.Split
(
chunksOf
)
import
Gargantext.Prelude
import
System.IO
(
hFlush
)
------------------------------------------------------------------------
-- | Tools
mapMP
::
MonadIO
m
=>
(
a
->
m
b
)
->
[
a
]
->
m
[
b
]
mapMP
f
xs
=
do
bs
<-
zipWithM
g
(
cycle
"-
\\
|/"
)
xs
liftIO
$
hPutStr
stderr
(
"
\r
Done
\n
"
::
Text
)
pure
bs
where
g
c
x
=
do
liftIO
$
hPutStr
stderr
[
'
\r
'
,
c
]
liftIO
$
hFlush
stderr
f
x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked
::
(
a
->
IO
b
)
->
[
a
]
->
IO
[
b
]
mapConcurrentlyChunked
f
ts
=
do
caps
<-
getNumCapabilities
let
n
=
1
`
max
`
(
length
ts
`
div
`
caps
)
concat
<$>
mapConcurrently
(
mapM
f
)
(
chunksOf
n
ts
)
bin/gargantext-cli/Main.hs
View file @
f789b2e4
...
...
@@ -12,129 +12,33 @@ Main specifications to index a corpus with a term list
-}
{-# LANGUAGE Strict #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE TypeApplications #-}
module
Main
where
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Aeson
(
ToJSON
,
encode
)
import
Data.List.Split
(
chunksOf
)
import
Data.Map.Strict
qualified
as
DM
import
Data.Text
(
pack
)
import
Data.Text
qualified
as
DT
import
Data.Text.Lazy
qualified
as
DTL
import
Data.Text.Lazy.Encoding
qualified
as
TLE
import
Data.Tuple.Extra
(
both
)
import
Data.Vector
qualified
as
DV
import
Gargantext.Core.Text.Context
(
TermList
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
readTSVFile
,
tsv_title
,
tsv_abstract
,
tsv_publication_year
,
fromMIntOrDec
,
defaultYear
)
import
Gargantext.Core.Text.List.Formats.TSV
(
tsvMapTermList
)
import
Gargantext.Core.Text.Metrics.Count
(
coocOnContexts
,
Coocs
)
import
Gargantext.Core.Text.Terms.WithList
(
Patterns
,
buildPatterns
,
extractTermsWithList
)
import
Gargantext.Prelude
import
System.IO
(
hFlush
)
import
Prelude
------------------------------------------------------------------------
-- OUTPUT format
import
CLI.Types
import
CLI.FilterTermsAndCooc
import
Options.Applicative
data
CoocByYear
=
CoocByYear
{
year
::
Int
,
nbContexts
::
NbContexts
,
coocurrences
::
Map
(
Text
,
Text
)
Coocs
}
deriving
(
Show
,
Generic
)
data
CoocByYears
=
CoocByYears
{
years
::
[
CoocByYear
]
}
deriving
(
Show
,
Generic
)
type
NbContexts
=
Int
instance
ToJSON
CoocByYear
instance
ToJSON
CoocByYears
------------------------------------------------------------------------
filterTermsAndCooc
::
Patterns
->
(
Int
,
[
Text
])
->
IO
CoocByYear
-- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc
patterns
(
year
,
ts
)
=
do
logWork
"start"
r
<-
coocOnContexts
identity
<$>
mapM
(
\
x
->
{-log "work" >>-}
terms'
patterns
x
)
ts
logWork
"stop"
pure
$
CoocByYear
year
(
length
ts
)
(
DM
.
mapKeys
(
both
DT
.
unwords
)
r
)
where
logWork
m
=
do
tid
<-
myThreadId
(
p
,
_
)
<-
threadCapability
tid
putText
.
unwords
$
[
"filterTermsAndCooc:"
,
m
,
show
year
,
"on proc"
,
show
p
]
runCLI
::
CLI
->
IO
()
runCLI
=
\
case
CLISub
CCMD_clean_csv_corpus
->
putStrLn
"TODO."
CLISub
(
CCMD_filter_terms_and_cooc
corpusFile
termListFile
outputFile
)
->
filterTermsAndCoocCLI
corpusFile
termListFile
outputFile
main
::
IO
()
main
=
do
[
corpusFile
,
termListFile
,
outputFile
]
<-
getArgs
--corpus :: IO (DM.IntMap [[Text]])
eCorpusFile
<-
readTSVFile
corpusFile
case
eCorpusFile
of
Right
cf
->
do
let
corpus
=
DM
.
fromListWith
(
<>
)
.
DV
.
toList
.
DV
.
map
(
\
n
->
(
fromMIntOrDec
defaultYear
$
tsv_publication_year
n
,
[(
tsv_title
n
)
<>
" "
<>
(
tsv_abstract
n
)]))
.
snd
$
cf
-- termListMap :: [Text]
termList
<-
tsvMapTermList
termListFile
putText
$
show
$
length
termList
let
patterns
=
buildPatterns
termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r
<-
mapConcurrently
(
filterTermsAndCooc
patterns
)
(
DM
.
toList
corpus
)
writeFile
outputFile
$
DTL
.
toStrict
$
TLE
.
decodeUtf8
$
encode
(
CoocByYears
r
)
Left
e
->
panicTrace
$
"Error: "
<>
e
------------------------------------------------------------------------
-- | Tools
mapMP
::
MonadIO
m
=>
(
a
->
m
b
)
->
[
a
]
->
m
[
b
]
mapMP
f
xs
=
do
bs
<-
zipWithM
g
(
cycle
"-
\\
|/"
)
xs
liftIO
$
hPutStr
stderr
(
"
\r
Done
\n
"
::
Text
)
pure
bs
main
=
runCLI
=<<
execParser
opts
where
g
c
x
=
do
liftIO
$
hPutStr
stderr
[
'
\r
'
,
c
]
liftIO
$
hFlush
stderr
f
x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked
::
(
a
->
IO
b
)
->
[
a
]
->
IO
[
b
]
mapConcurrentlyChunked
f
ts
=
do
caps
<-
getNumCapabilities
let
n
=
1
`
max
`
(
length
ts
`
div
`
caps
)
concat
<$>
mapConcurrently
(
mapM
f
)
(
chunksOf
n
ts
)
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms'
::
Applicative
f
=>
Patterns
->
Text
->
f
[[
Text
]]
terms'
pats
txt
=
pure
$
concat
$
extractTermsWithList
pats
txt
-- | TODO Minimal Example
--testCooc = do
-- let patterns = buildPatterns testTermList
-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus
::
[(
Int
,
[
Text
])]
testCorpus
=
[
(
1998
,
[
pack
"The beees"
])
,
(
1999
,
[
pack
"The bees and the flowers"
--, pack "The bees and the flowers"
])
]
testTermList
::
TermList
testTermList
=
[
([
pack
"bee"
],
[[
pack
"bees"
]])
,
([
pack
"flower"
],
[[
pack
"flowers"
]])
]
opts
=
info
(
helper
<*>
allOptions
)
(
fullDesc
<>
progDesc
"CLI for the gargantext-server"
<>
header
"gargantext-cli tools"
)
allOptions
::
Parser
CLI
allOptions
=
subparser
(
filterTermsAndCoocCmd
)
gargantext.cabal
View file @
f789b2e4
...
...
@@ -710,7 +710,10 @@ executable gargantext-cli
, optimized
main-is: Main.hs
other-modules:
CleanCsvCorpus
CLI.CleanCsvCorpus
CLI.Types
CLI.FilterTermsAndCooc
CLI.Utils
Paths_gargantext
hs-source-dirs:
bin/gargantext-cli
...
...
@@ -726,6 +729,7 @@ executable gargantext-cli
, gargantext-prelude
, ini ^>= 0.4.1
, optparse-generic ^>= 1.4.7
, optparse-applicative
, protolude ^>= 0.3.3
, split ^>= 0.2.3.4
, text ^>= 1.2.4.1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment