Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
160
Issues
160
List
Board
Labels
Milestones
Merge Requests
14
Merge Requests
14
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
f789b2e4
Commit
f789b2e4
authored
Jun 24, 2024
by
Alfredo Di Napoli
1
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
CLI: Add the filter-terms command
parent
0087a0f8
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
210 additions
and
120 deletions
+210
-120
CleanCsvCorpus.hs
bin/gargantext-cli/CLI/CleanCsvCorpus.hs
+1
-1
FilterTermsAndCooc.hs
bin/gargantext-cli/CLI/FilterTermsAndCooc.hs
+125
-0
Types.hs
bin/gargantext-cli/CLI/Types.hs
+25
-0
Utils.hs
bin/gargantext-cli/CLI/Utils.hs
+32
-0
Main.hs
bin/gargantext-cli/Main.hs
+22
-118
gargantext.cabal
gargantext.cabal
+5
-1
No files found.
bin/gargantext-cli/CleanCsvCorpus.hs
→
bin/gargantext-cli/C
LI/C
leanCsvCorpus.hs
View file @
f789b2e4
...
...
@@ -12,7 +12,7 @@ compress the contexts around the main terms of the query.
-}
module
CleanCsvCorpus
where
module
C
LI.C
leanCsvCorpus
where
import
Data.SearchEngine
qualified
as
S
import
Data.Set
qualified
as
S
...
...
bin/gargantext-cli/CLI/FilterTermsAndCooc.hs
0 → 100644
View file @
f789b2e4
module
CLI.FilterTermsAndCooc
(
filterTermsAndCoocCmd
,
filterTermsAndCoocCLI
-- * Testing functions
,
testCorpus
,
testTermList
)
where
import
CLI.Types
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Aeson
(
encode
)
import
Data.Map.Strict
qualified
as
DM
import
Data.Text
(
pack
)
import
Data.Text
qualified
as
DT
import
Data.Text.Lazy
qualified
as
DTL
import
Data.Text.Lazy.Encoding
qualified
as
TLE
import
Data.Tuple.Extra
(
both
)
import
Data.Vector
qualified
as
DV
import
GHC.Generics
import
Gargantext.Core.Text.Context
(
TermList
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
readTSVFile
,
tsv_title
,
tsv_abstract
,
tsv_publication_year
,
fromMIntOrDec
,
defaultYear
)
import
Gargantext.Core.Text.List.Formats.TSV
(
tsvMapTermList
)
import
Gargantext.Core.Text.Metrics.Count
(
coocOnContexts
,
Coocs
)
import
Gargantext.Core.Text.Terms.WithList
(
Patterns
,
buildPatterns
,
extractTermsWithList
)
import
Gargantext.Prelude
import
Options.Applicative
------------------------------------------------------------------------
-- OUTPUT format
data
CoocByYear
=
CoocByYear
{
year
::
Int
,
nbContexts
::
NbContexts
,
coocurrences
::
Map
(
Text
,
Text
)
Coocs
}
deriving
(
Show
,
Generic
)
data
CoocByYears
=
CoocByYears
{
years
::
[
CoocByYear
]
}
deriving
(
Show
,
Generic
)
type
NbContexts
=
Int
instance
ToJSON
CoocByYear
instance
ToJSON
CoocByYears
------------------------------------------------------------------------
filterTermsAndCoocCLI
::
CorpusFile
->
TermListFile
->
OutputFile
->
IO
()
filterTermsAndCoocCLI
(
CorpusFile
corpusFile
)
(
TermListFile
termListFile
)
(
OutputFile
outputFile
)
=
do
--corpus :: IO (DM.IntMap [[Text]])
eCorpusFile
<-
readTSVFile
corpusFile
case
eCorpusFile
of
Right
cf
->
do
let
corpus
=
DM
.
fromListWith
(
<>
)
.
DV
.
toList
.
DV
.
map
(
\
n
->
(
fromMIntOrDec
defaultYear
$
tsv_publication_year
n
,
[(
tsv_title
n
)
<>
" "
<>
(
tsv_abstract
n
)]))
.
snd
$
cf
-- termListMap :: [Text]
termList
<-
tsvMapTermList
termListFile
putText
$
show
$
length
termList
let
patterns
=
buildPatterns
termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r
<-
mapConcurrently
(
filterTermsAndCooc
patterns
)
(
DM
.
toList
corpus
)
writeFile
outputFile
$
DTL
.
toStrict
$
TLE
.
decodeUtf8
$
encode
(
CoocByYears
r
)
Left
e
->
panicTrace
$
"Error: "
<>
e
filterTermsAndCooc
::
Patterns
->
(
Int
,
[
Text
])
->
IO
CoocByYear
-- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc
patterns
(
year
,
ts
)
=
do
logWork
"start"
r
<-
coocOnContexts
identity
<$>
mapM
(
\
x
->
{-log "work" >>-}
terms'
patterns
x
)
ts
logWork
"stop"
pure
$
CoocByYear
year
(
length
ts
)
(
DM
.
mapKeys
(
both
DT
.
unwords
)
r
)
where
logWork
m
=
do
tid
<-
myThreadId
(
p
,
_
)
<-
threadCapability
tid
putText
.
unwords
$
[
"filterTermsAndCooc:"
,
m
,
show
year
,
"on proc"
,
show
p
]
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms'
::
Applicative
f
=>
Patterns
->
Text
->
f
[[
Text
]]
terms'
pats
txt
=
pure
$
concat
$
extractTermsWithList
pats
txt
-- | TODO Minimal Example
--testCooc = do
-- let patterns = buildPatterns testTermList
-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus
::
[(
Int
,
[
Text
])]
testCorpus
=
[
(
1998
,
[
pack
"The beees"
])
,
(
1999
,
[
pack
"The bees and the flowers"
--, pack "The bees and the flowers"
])
]
testTermList
::
TermList
testTermList
=
[
([
pack
"bee"
],
[[
pack
"bees"
]])
,
([
pack
"flower"
],
[[
pack
"flowers"
]])
]
--
-- CLI API
--
filterTermsAndCoocCmd
::
HasCallStack
=>
Mod
CommandFields
CLI
filterTermsAndCoocCmd
=
command
"filter-terms"
(
info
(
helper
<*>
fmap
CLISub
filterTerms
)
(
progDesc
"Filter Terms and Cooc."
))
filterTerms
::
Parser
CLICmd
filterTerms
=
CCMD_filter_terms_and_cooc
<$>
(
option
str
(
long
"corpus-file"
<>
metavar
"FILE"
))
<*>
(
option
str
(
long
"terms-list-file"
<>
metavar
"FILE"
))
<*>
(
option
str
(
long
"output-file"
<>
metavar
"FILE"
))
bin/gargantext-cli/CLI/Types.hs
0 → 100644
View file @
f789b2e4
module
CLI.Types
where
import
Prelude
import
Data.String
newtype
CorpusFile
=
CorpusFile
{
_CorpusFile
::
FilePath
}
deriving
(
Show
,
Eq
,
IsString
)
newtype
TermListFile
=
TermListFile
{
_TermsListFile
::
FilePath
}
deriving
(
Show
,
Eq
,
IsString
)
newtype
OutputFile
=
OutputFile
{
_OutputFile
::
FilePath
}
deriving
(
Show
,
Eq
,
IsString
)
data
CLICmd
=
CCMD_clean_csv_corpus
|
CCMD_filter_terms_and_cooc
!
CorpusFile
!
TermListFile
!
OutputFile
deriving
(
Show
,
Eq
)
data
CLI
=
CLISub
CLICmd
deriving
(
Show
,
Eq
)
bin/gargantext-cli/CLI/Utils.hs
0 → 100644
View file @
f789b2e4
module
CLI.Utils
(
mapMP
,
mapConcurrentlyChunked
)
where
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.List.Split
(
chunksOf
)
import
Gargantext.Prelude
import
System.IO
(
hFlush
)
------------------------------------------------------------------------
-- | Tools
mapMP
::
MonadIO
m
=>
(
a
->
m
b
)
->
[
a
]
->
m
[
b
]
mapMP
f
xs
=
do
bs
<-
zipWithM
g
(
cycle
"-
\\
|/"
)
xs
liftIO
$
hPutStr
stderr
(
"
\r
Done
\n
"
::
Text
)
pure
bs
where
g
c
x
=
do
liftIO
$
hPutStr
stderr
[
'
\r
'
,
c
]
liftIO
$
hFlush
stderr
f
x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked
::
(
a
->
IO
b
)
->
[
a
]
->
IO
[
b
]
mapConcurrentlyChunked
f
ts
=
do
caps
<-
getNumCapabilities
let
n
=
1
`
max
`
(
length
ts
`
div
`
caps
)
concat
<$>
mapConcurrently
(
mapM
f
)
(
chunksOf
n
ts
)
bin/gargantext-cli/Main.hs
View file @
f789b2e4
...
...
@@ -12,129 +12,33 @@ Main specifications to index a corpus with a term list
-}
{-# LANGUAGE Strict #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE TypeApplications #-}
module
Main
where
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Aeson
(
ToJSON
,
encode
)
import
Data.List.Split
(
chunksOf
)
import
Data.Map.Strict
qualified
as
DM
import
Data.Text
(
pack
)
import
Data.Text
qualified
as
DT
import
Data.Text.Lazy
qualified
as
DTL
import
Data.Text.Lazy.Encoding
qualified
as
TLE
import
Data.Tuple.Extra
(
both
)
import
Data.Vector
qualified
as
DV
import
Gargantext.Core.Text.Context
(
TermList
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
readTSVFile
,
tsv_title
,
tsv_abstract
,
tsv_publication_year
,
fromMIntOrDec
,
defaultYear
)
import
Gargantext.Core.Text.List.Formats.TSV
(
tsvMapTermList
)
import
Gargantext.Core.Text.Metrics.Count
(
coocOnContexts
,
Coocs
)
import
Gargantext.Core.Text.Terms.WithList
(
Patterns
,
buildPatterns
,
extractTermsWithList
)
import
Gargantext.Prelude
import
System.IO
(
hFlush
)
import
Prelude
------------------------------------------------------------------------
-- OUTPUT format
import
CLI.Types
import
CLI.FilterTermsAndCooc
import
Options.Applicative
data
CoocByYear
=
CoocByYear
{
year
::
Int
,
nbContexts
::
NbContexts
,
coocurrences
::
Map
(
Text
,
Text
)
Coocs
}
deriving
(
Show
,
Generic
)
data
CoocByYears
=
CoocByYears
{
years
::
[
CoocByYear
]
}
deriving
(
Show
,
Generic
)
type
NbContexts
=
Int
instance
ToJSON
CoocByYear
instance
ToJSON
CoocByYears
------------------------------------------------------------------------
filterTermsAndCooc
::
Patterns
->
(
Int
,
[
Text
])
->
IO
CoocByYear
-- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc
patterns
(
year
,
ts
)
=
do
logWork
"start"
r
<-
coocOnContexts
identity
<$>
mapM
(
\
x
->
{-log "work" >>-}
terms'
patterns
x
)
ts
logWork
"stop"
pure
$
CoocByYear
year
(
length
ts
)
(
DM
.
mapKeys
(
both
DT
.
unwords
)
r
)
where
logWork
m
=
do
tid
<-
myThreadId
(
p
,
_
)
<-
threadCapability
tid
putText
.
unwords
$
[
"filterTermsAndCooc:"
,
m
,
show
year
,
"on proc"
,
show
p
]
runCLI
::
CLI
->
IO
()
runCLI
=
\
case
CLISub
CCMD_clean_csv_corpus
->
putStrLn
"TODO."
CLISub
(
CCMD_filter_terms_and_cooc
corpusFile
termListFile
outputFile
)
->
filterTermsAndCoocCLI
corpusFile
termListFile
outputFile
main
::
IO
()
main
=
do
[
corpusFile
,
termListFile
,
outputFile
]
<-
getArgs
--corpus :: IO (DM.IntMap [[Text]])
eCorpusFile
<-
readTSVFile
corpusFile
case
eCorpusFile
of
Right
cf
->
do
let
corpus
=
DM
.
fromListWith
(
<>
)
.
DV
.
toList
.
DV
.
map
(
\
n
->
(
fromMIntOrDec
defaultYear
$
tsv_publication_year
n
,
[(
tsv_title
n
)
<>
" "
<>
(
tsv_abstract
n
)]))
.
snd
$
cf
-- termListMap :: [Text]
termList
<-
tsvMapTermList
termListFile
putText
$
show
$
length
termList
let
patterns
=
buildPatterns
termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r
<-
mapConcurrently
(
filterTermsAndCooc
patterns
)
(
DM
.
toList
corpus
)
writeFile
outputFile
$
DTL
.
toStrict
$
TLE
.
decodeUtf8
$
encode
(
CoocByYears
r
)
Left
e
->
panicTrace
$
"Error: "
<>
e
------------------------------------------------------------------------
-- | Tools
mapMP
::
MonadIO
m
=>
(
a
->
m
b
)
->
[
a
]
->
m
[
b
]
mapMP
f
xs
=
do
bs
<-
zipWithM
g
(
cycle
"-
\\
|/"
)
xs
liftIO
$
hPutStr
stderr
(
"
\r
Done
\n
"
::
Text
)
pure
bs
main
=
runCLI
=<<
execParser
opts
where
g
c
x
=
do
liftIO
$
hPutStr
stderr
[
'
\r
'
,
c
]
liftIO
$
hFlush
stderr
f
x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked
::
(
a
->
IO
b
)
->
[
a
]
->
IO
[
b
]
mapConcurrentlyChunked
f
ts
=
do
caps
<-
getNumCapabilities
let
n
=
1
`
max
`
(
length
ts
`
div
`
caps
)
concat
<$>
mapConcurrently
(
mapM
f
)
(
chunksOf
n
ts
)
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms'
::
Applicative
f
=>
Patterns
->
Text
->
f
[[
Text
]]
terms'
pats
txt
=
pure
$
concat
$
extractTermsWithList
pats
txt
-- | TODO Minimal Example
--testCooc = do
-- let patterns = buildPatterns testTermList
-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus
::
[(
Int
,
[
Text
])]
testCorpus
=
[
(
1998
,
[
pack
"The beees"
])
,
(
1999
,
[
pack
"The bees and the flowers"
--, pack "The bees and the flowers"
])
]
testTermList
::
TermList
testTermList
=
[
([
pack
"bee"
],
[[
pack
"bees"
]])
,
([
pack
"flower"
],
[[
pack
"flowers"
]])
]
opts
=
info
(
helper
<*>
allOptions
)
(
fullDesc
<>
progDesc
"CLI for the gargantext-server"
<>
header
"gargantext-cli tools"
)
allOptions
::
Parser
CLI
allOptions
=
subparser
(
filterTermsAndCoocCmd
)
gargantext.cabal
View file @
f789b2e4
...
...
@@ -710,7 +710,10 @@ executable gargantext-cli
, optimized
main-is: Main.hs
other-modules:
CleanCsvCorpus
CLI.CleanCsvCorpus
CLI.Types
CLI.FilterTermsAndCooc
CLI.Utils
Paths_gargantext
hs-source-dirs:
bin/gargantext-cli
...
...
@@ -726,6 +729,7 @@ executable gargantext-cli
, gargantext-prelude
, ini ^>= 0.4.1
, optparse-generic ^>= 1.4.7
, optparse-applicative
, protolude ^>= 0.3.3
, split ^>= 0.2.3.4
, text ^>= 1.2.4.1
...
...
Przemyslaw Kaminski
@cgenie
mentioned in commit
5660aec0
·
Oct 08, 2024
mentioned in commit
5660aec0
mentioned in commit 5660aec07ec5a0a0a5468f440092c1a8f57a864e
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment