Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Grégoire Locqueville
haskell-gargantext
Commits
19c7c2db
Commit
19c7c2db
authored
Jun 25, 2024
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'origin/adinapoli/issue-355' into dev
parents
5589b7ae
7c1bc974
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
243 additions
and
172 deletions
+243
-172
CleanCsvCorpus.hs
bin/gargantext-cli/CLI/CleanCsvCorpus.hs
+1
-1
FilterTermsAndCooc.hs
bin/gargantext-cli/CLI/FilterTermsAndCooc.hs
+125
-0
ObfuscateDB.hs
bin/gargantext-cli/CLI/ObfuscateDB.hs
+14
-27
Types.hs
bin/gargantext-cli/CLI/Types.hs
+36
-0
Utils.hs
bin/gargantext-cli/CLI/Utils.hs
+32
-0
Main.hs
bin/gargantext-cli/Main.hs
+26
-118
gargantext.cabal
gargantext.cabal
+9
-26
No files found.
bin/gargantext-cli/CleanCsvCorpus.hs
→
bin/gargantext-cli/C
LI/C
leanCsvCorpus.hs
View file @
19c7c2db
...
...
@@ -12,7 +12,7 @@ compress the contexts around the main terms of the query.
-}
module
CleanCsvCorpus
where
module
C
LI.C
leanCsvCorpus
where
import
Data.SearchEngine
qualified
as
S
import
Data.Set
qualified
as
S
...
...
bin/gargantext-cli/CLI/FilterTermsAndCooc.hs
0 → 100644
View file @
19c7c2db
module
CLI.FilterTermsAndCooc
(
filterTermsAndCoocCmd
,
filterTermsAndCoocCLI
-- * Testing functions
,
testCorpus
,
testTermList
)
where
import
CLI.Types
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Aeson
(
encode
)
import
Data.Map.Strict
qualified
as
DM
import
Data.Text
(
pack
)
import
Data.Text
qualified
as
DT
import
Data.Text.Lazy
qualified
as
DTL
import
Data.Text.Lazy.Encoding
qualified
as
TLE
import
Data.Tuple.Extra
(
both
)
import
Data.Vector
qualified
as
DV
import
GHC.Generics
import
Gargantext.Core.Text.Context
(
TermList
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
readTSVFile
,
tsv_title
,
tsv_abstract
,
tsv_publication_year
,
fromMIntOrDec
,
defaultYear
)
import
Gargantext.Core.Text.List.Formats.TSV
(
tsvMapTermList
)
import
Gargantext.Core.Text.Metrics.Count
(
coocOnContexts
,
Coocs
)
import
Gargantext.Core.Text.Terms.WithList
(
Patterns
,
buildPatterns
,
extractTermsWithList
)
import
Gargantext.Prelude
import
Options.Applicative
------------------------------------------------------------------------
-- OUTPUT format
data
CoocByYear
=
CoocByYear
{
year
::
Int
,
nbContexts
::
NbContexts
,
coocurrences
::
Map
(
Text
,
Text
)
Coocs
}
deriving
(
Show
,
Generic
)
data
CoocByYears
=
CoocByYears
{
years
::
[
CoocByYear
]
}
deriving
(
Show
,
Generic
)
type
NbContexts
=
Int
instance
ToJSON
CoocByYear
instance
ToJSON
CoocByYears
------------------------------------------------------------------------
filterTermsAndCoocCLI
::
CorpusFile
->
TermListFile
->
OutputFile
->
IO
()
filterTermsAndCoocCLI
(
CorpusFile
corpusFile
)
(
TermListFile
termListFile
)
(
OutputFile
outputFile
)
=
do
--corpus :: IO (DM.IntMap [[Text]])
eCorpusFile
<-
readTSVFile
corpusFile
case
eCorpusFile
of
Right
cf
->
do
let
corpus
=
DM
.
fromListWith
(
<>
)
.
DV
.
toList
.
DV
.
map
(
\
n
->
(
fromMIntOrDec
defaultYear
$
tsv_publication_year
n
,
[(
tsv_title
n
)
<>
" "
<>
(
tsv_abstract
n
)]))
.
snd
$
cf
-- termListMap :: [Text]
termList
<-
tsvMapTermList
termListFile
putText
$
show
$
length
termList
let
patterns
=
buildPatterns
termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r
<-
mapConcurrently
(
filterTermsAndCooc
patterns
)
(
DM
.
toList
corpus
)
writeFile
outputFile
$
DTL
.
toStrict
$
TLE
.
decodeUtf8
$
encode
(
CoocByYears
r
)
Left
e
->
panicTrace
$
"Error: "
<>
e
filterTermsAndCooc
::
Patterns
->
(
Int
,
[
Text
])
->
IO
CoocByYear
-- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc
patterns
(
year
,
ts
)
=
do
logWork
"start"
r
<-
coocOnContexts
identity
<$>
mapM
(
\
x
->
{-log "work" >>-}
terms'
patterns
x
)
ts
logWork
"stop"
pure
$
CoocByYear
year
(
length
ts
)
(
DM
.
mapKeys
(
both
DT
.
unwords
)
r
)
where
logWork
m
=
do
tid
<-
myThreadId
(
p
,
_
)
<-
threadCapability
tid
putText
.
unwords
$
[
"filterTermsAndCooc:"
,
m
,
show
year
,
"on proc"
,
show
p
]
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms'
::
Applicative
f
=>
Patterns
->
Text
->
f
[[
Text
]]
terms'
pats
txt
=
pure
$
concat
$
extractTermsWithList
pats
txt
-- | TODO Minimal Example
--testCooc = do
-- let patterns = buildPatterns testTermList
-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus
::
[(
Int
,
[
Text
])]
testCorpus
=
[
(
1998
,
[
pack
"The beees"
])
,
(
1999
,
[
pack
"The bees and the flowers"
--, pack "The bees and the flowers"
])
]
testTermList
::
TermList
testTermList
=
[
([
pack
"bee"
],
[[
pack
"bees"
]])
,
([
pack
"flower"
],
[[
pack
"flowers"
]])
]
--
-- CLI API
--
filterTermsAndCoocCmd
::
HasCallStack
=>
Mod
CommandFields
CLI
filterTermsAndCoocCmd
=
command
"filter-terms"
(
info
(
helper
<*>
fmap
CLISub
filterTerms
)
(
progDesc
"Filter Terms and Cooc."
))
filterTerms
::
Parser
CLICmd
filterTerms
=
CCMD_filter_terms_and_cooc
<$>
(
option
str
(
long
"corpus-file"
<>
metavar
"FILE"
))
<*>
(
option
str
(
long
"terms-list-file"
<>
metavar
"FILE"
))
<*>
(
option
str
(
long
"output-file"
<>
metavar
"FILE"
))
bin/gargantext-
db-obfuscation/Main
.hs
→
bin/gargantext-
cli/CLI/ObfuscateDB
.hs
View file @
19c7c2db
...
...
@@ -28,8 +28,10 @@ https://stackoverflow.com/questions/876522/creating-a-copy-of-a-database-in-post
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE Strict #-}
module
Main
where
module
CLI.ObfuscateDB
(
obfuscateDB
,
obfuscateDBCmd
)
where
import
Data.Text
qualified
as
T
import
Database.PostgreSQL.Simple
qualified
as
PSQL
...
...
@@ -37,23 +39,16 @@ import Database.PostgreSQL.Simple.SqlQQ (sql)
import
Gargantext.Core
(
toDBid
)
import
Gargantext.Database.Admin.Config
()
import
Gargantext.Database.Admin.Types.Node
(
NodeType
(
..
))
import
Gargantext.Prelude
hiding
(
option
)
import
Gargantext.Prelude
import
Gargantext.Prelude.Database
(
runPGSExecute
,
runPGSQuery
)
import
Options.Applicative.Simple
import
CLI.Types
import
Options.Applicative
data
Args
=
Args
{
dbHost
::
Text
,
dbPort
::
Int
,
dbName
::
Text
,
dbUser
::
Text
,
dbPassword
::
Text
}
deriving
(
Show
,
Eq
)
obfuscateDBCmd
::
HasCallStack
=>
Mod
CommandFields
CLI
obfuscateDBCmd
=
command
"obfuscate-db"
(
info
(
helper
<*>
fmap
CLISub
obfuscateDB_p
)
(
progDesc
"Obfuscate a cloned Gargantext DB."
))
args
::
Parser
Args
args
=
Args
obfuscateDB_p
::
Parser
CLICmd
obfuscateDB_p
=
fmap
CCMD_obfuscate_db
$
ObfuscateDBArgs
<$>
(
strOption
(
long
"db-host"
<>
metavar
"db-host"
<>
help
"Location of the DB server"
...
...
@@ -71,17 +66,9 @@ args = Args
<*>
(
strOption
(
long
"db-password"
<>
metavar
"db-password"
<>
value
""
))
main
::
IO
()
main
=
do
(
opts
,
()
)
<-
simpleOptions
"0.0.1"
"gargantext DB obfuscation"
"Obfuscates a cloned Gargantext DB"
args
empty
obfuscateDB
::
ObfuscateDBArgs
->
IO
()
obfuscateDB
opts
=
do
putText
$
show
opts
let
ci
=
PSQL
.
ConnectInfo
{
connectHost
=
T
.
unpack
$
dbHost
opts
...
...
@@ -101,7 +88,7 @@ main = do
obfuscateNotes
::
PSQL
.
Connection
->
IO
()
obfuscateNotes
c
=
do
let
nt
=
toDBid
Notes
_
<-
runPGSExecute
c
[
sql
|
UPDATE nodes SET name = concat('notes-', id) WHERE typename = ?;
|]
(
PSQL
.
Only
nt
)
nsNew
<-
runPGSQuery
c
[
sql
|
SELECT id, name FROM nodes WHERE typename = ?
|]
(
PSQL
.
Only
nt
)
::
IO
[(
Int
,
Text
)]
...
...
bin/gargantext-cli/CLI/Types.hs
0 → 100644
View file @
19c7c2db
module
CLI.Types
where
import
Prelude
import
Data.String
import
Data.Text
(
Text
)
newtype
CorpusFile
=
CorpusFile
{
_CorpusFile
::
FilePath
}
deriving
(
Show
,
Eq
,
IsString
)
newtype
TermListFile
=
TermListFile
{
_TermsListFile
::
FilePath
}
deriving
(
Show
,
Eq
,
IsString
)
newtype
OutputFile
=
OutputFile
{
_OutputFile
::
FilePath
}
deriving
(
Show
,
Eq
,
IsString
)
data
ObfuscateDBArgs
=
ObfuscateDBArgs
{
dbHost
::
!
Text
,
dbPort
::
!
Int
,
dbName
::
!
Text
,
dbUser
::
!
Text
,
dbPassword
::
!
Text
}
deriving
(
Show
,
Eq
)
data
CLICmd
=
CCMD_clean_csv_corpus
|
CCMD_filter_terms_and_cooc
!
CorpusFile
!
TermListFile
!
OutputFile
|
CCMD_obfuscate_db
!
ObfuscateDBArgs
deriving
(
Show
,
Eq
)
data
CLI
=
CLISub
CLICmd
deriving
(
Show
,
Eq
)
bin/gargantext-cli/CLI/Utils.hs
0 → 100644
View file @
19c7c2db
module
CLI.Utils
(
mapMP
,
mapConcurrentlyChunked
)
where
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.List.Split
(
chunksOf
)
import
Gargantext.Prelude
import
System.IO
(
hFlush
)
------------------------------------------------------------------------
-- | Tools
mapMP
::
MonadIO
m
=>
(
a
->
m
b
)
->
[
a
]
->
m
[
b
]
mapMP
f
xs
=
do
bs
<-
zipWithM
g
(
cycle
"-
\\
|/"
)
xs
liftIO
$
hPutStr
stderr
(
"
\r
Done
\n
"
::
Text
)
pure
bs
where
g
c
x
=
do
liftIO
$
hPutStr
stderr
[
'
\r
'
,
c
]
liftIO
$
hFlush
stderr
f
x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked
::
(
a
->
IO
b
)
->
[
a
]
->
IO
[
b
]
mapConcurrentlyChunked
f
ts
=
do
caps
<-
getNumCapabilities
let
n
=
1
`
max
`
(
length
ts
`
div
`
caps
)
concat
<$>
mapConcurrently
(
mapM
f
)
(
chunksOf
n
ts
)
bin/gargantext-cli/Main.hs
View file @
19c7c2db
...
...
@@ -12,129 +12,37 @@ Main specifications to index a corpus with a term list
-}
{-# LANGUAGE Strict #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE TypeApplications #-}
module
Main
where
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Aeson
(
ToJSON
,
encode
)
import
Data.List.Split
(
chunksOf
)
import
Data.Map.Strict
qualified
as
DM
import
Data.Text
(
pack
)
import
Data.Text
qualified
as
DT
import
Data.Text.Lazy
qualified
as
DTL
import
Data.Text.Lazy.Encoding
qualified
as
TLE
import
Data.Tuple.Extra
(
both
)
import
Data.Vector
qualified
as
DV
import
Gargantext.Core.Text.Context
(
TermList
)
import
Gargantext.Core.Text.Corpus.Parsers.TSV
(
readTSVFile
,
tsv_title
,
tsv_abstract
,
tsv_publication_year
,
fromMIntOrDec
,
defaultYear
)
import
Gargantext.Core.Text.List.Formats.TSV
(
tsvMapTermList
)
import
Gargantext.Core.Text.Metrics.Count
(
coocOnContexts
,
Coocs
)
import
Gargantext.Core.Text.Terms.WithList
(
Patterns
,
buildPatterns
,
extractTermsWithList
)
import
Gargantext.Prelude
import
System.IO
(
hFlush
)
import
Prelude
------------------------------------------------------------------------
-- OUTPUT format
import
CLI.FilterTermsAndCooc
import
CLI.ObfuscateDB
(
obfuscateDB
,
obfuscateDBCmd
)
import
CLI.Types
import
Options.Applicative
data
CoocByYear
=
CoocByYear
{
year
::
Int
,
nbContexts
::
NbContexts
,
coocurrences
::
Map
(
Text
,
Text
)
Coocs
}
deriving
(
Show
,
Generic
)
data
CoocByYears
=
CoocByYears
{
years
::
[
CoocByYear
]
}
deriving
(
Show
,
Generic
)
type
NbContexts
=
Int
instance
ToJSON
CoocByYear
instance
ToJSON
CoocByYears
------------------------------------------------------------------------
filterTermsAndCooc
::
Patterns
->
(
Int
,
[
Text
])
->
IO
CoocByYear
-- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc
patterns
(
year
,
ts
)
=
do
logWork
"start"
r
<-
coocOnContexts
identity
<$>
mapM
(
\
x
->
{-log "work" >>-}
terms'
patterns
x
)
ts
logWork
"stop"
pure
$
CoocByYear
year
(
length
ts
)
(
DM
.
mapKeys
(
both
DT
.
unwords
)
r
)
where
logWork
m
=
do
tid
<-
myThreadId
(
p
,
_
)
<-
threadCapability
tid
putText
.
unwords
$
[
"filterTermsAndCooc:"
,
m
,
show
year
,
"on proc"
,
show
p
]
runCLI
::
CLI
->
IO
()
runCLI
=
\
case
CLISub
CCMD_clean_csv_corpus
->
putStrLn
"TODO."
CLISub
(
CCMD_filter_terms_and_cooc
corpusFile
termListFile
outputFile
)
->
filterTermsAndCoocCLI
corpusFile
termListFile
outputFile
CLISub
(
CCMD_obfuscate_db
args
)
->
obfuscateDB
args
main
::
IO
()
main
=
do
[
corpusFile
,
termListFile
,
outputFile
]
<-
getArgs
--corpus :: IO (DM.IntMap [[Text]])
eCorpusFile
<-
readTSVFile
corpusFile
case
eCorpusFile
of
Right
cf
->
do
let
corpus
=
DM
.
fromListWith
(
<>
)
.
DV
.
toList
.
DV
.
map
(
\
n
->
(
fromMIntOrDec
defaultYear
$
tsv_publication_year
n
,
[(
tsv_title
n
)
<>
" "
<>
(
tsv_abstract
n
)]))
.
snd
$
cf
-- termListMap :: [Text]
termList
<-
tsvMapTermList
termListFile
putText
$
show
$
length
termList
let
patterns
=
buildPatterns
termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r
<-
mapConcurrently
(
filterTermsAndCooc
patterns
)
(
DM
.
toList
corpus
)
writeFile
outputFile
$
DTL
.
toStrict
$
TLE
.
decodeUtf8
$
encode
(
CoocByYears
r
)
Left
e
->
panicTrace
$
"Error: "
<>
e
------------------------------------------------------------------------
-- | Tools
mapMP
::
MonadIO
m
=>
(
a
->
m
b
)
->
[
a
]
->
m
[
b
]
mapMP
f
xs
=
do
bs
<-
zipWithM
g
(
cycle
"-
\\
|/"
)
xs
liftIO
$
hPutStr
stderr
(
"
\r
Done
\n
"
::
Text
)
pure
bs
main
=
runCLI
=<<
execParser
opts
where
g
c
x
=
do
liftIO
$
hPutStr
stderr
[
'
\r
'
,
c
]
liftIO
$
hFlush
stderr
f
x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked
::
(
a
->
IO
b
)
->
[
a
]
->
IO
[
b
]
mapConcurrentlyChunked
f
ts
=
do
caps
<-
getNumCapabilities
let
n
=
1
`
max
`
(
length
ts
`
div
`
caps
)
concat
<$>
mapConcurrently
(
mapM
f
)
(
chunksOf
n
ts
)
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms'
::
Applicative
f
=>
Patterns
->
Text
->
f
[[
Text
]]
terms'
pats
txt
=
pure
$
concat
$
extractTermsWithList
pats
txt
-- | TODO Minimal Example
--testCooc = do
-- let patterns = buildPatterns testTermList
-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus
::
[(
Int
,
[
Text
])]
testCorpus
=
[
(
1998
,
[
pack
"The beees"
])
,
(
1999
,
[
pack
"The bees and the flowers"
--, pack "The bees and the flowers"
])
]
testTermList
::
TermList
testTermList
=
[
([
pack
"bee"
],
[[
pack
"bees"
]])
,
([
pack
"flower"
],
[[
pack
"flowers"
]])
]
opts
=
info
(
helper
<*>
allOptions
)
(
fullDesc
<>
progDesc
"CLI for the gargantext-server"
<>
header
"gargantext-cli tools"
)
allOptions
::
Parser
CLI
allOptions
=
subparser
(
filterTermsAndCoocCmd
<>
obfuscateDBCmd
)
gargantext.cabal
View file @
19c7c2db
...
...
@@ -81,7 +81,7 @@ common optimized
-rtsopts
-with-rtsopts=-N
-Wmissing-signatures
-- When enabled, it swaps the hashing algorithm
-- with a quicker (and less secure) version, which
-- runs faster in tests.
...
...
@@ -89,10 +89,6 @@ flag test-crypto
default: False
manual: True
flag disable-db-obfuscation-executable
default: False
manual: True
-- When enabled, it suppresses at compile time the
-- debug output for the phylo code, so that it doesn't
-- hinder its performance.
...
...
@@ -710,7 +706,11 @@ executable gargantext-cli
, optimized
main-is: Main.hs
other-modules:
CleanCsvCorpus
CLI.CleanCsvCorpus
CLI.FilterTermsAndCooc
CLI.ObfuscateDB
CLI.Types
CLI.Utils
Paths_gargantext
hs-source-dirs:
bin/gargantext-cli
...
...
@@ -720,38 +720,21 @@ executable gargantext-cli
, bytestring ^>= 0.10.12.0
, cassava ^>= 0.5.2.0
, containers ^>= 0.6.5.1
, extra
, extra ^>= 1.7.9
, full-text-search ^>= 0.2.1.4
, gargantext
, gargantext-prelude
, ini ^>= 0.4.1
, optparse-applicative
, optparse-generic ^>= 1.4.7
, postgresql-simple ^>= 0.6.4
, protolude ^>= 0.3.3
, split ^>= 0.2.3.4
, text ^>= 1.2.4.1
, unordered-containers ^>= 0.2.16.0
, vector ^>= 0.12.3.0
executable gargantext-db-obfuscation
import:
defaults
, optimized
main-is: Main.hs
other-modules:
Paths_gargantext
hs-source-dirs:
bin/gargantext-db-obfuscation
if flag(disable-db-obfuscation-executable)
buildable: False
else
build-depends:
extra
, gargantext
, gargantext-prelude
, optparse-simple
, postgresql-simple ^>= 0.6.4
, text
executable gargantext-import
import:
defaults
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment