haskell-gargantext
Commit 1a498118 authored Jan 31, 2023 by Alexandre Delanoë
[FIX] WOS Parser
parent 53e86575
Pipeline #3628 failed with stage in 53 minutes and 54 seconds
Showing 6 changed files with 135 additions and 26 deletions
CHANGELOG.md                                        +1   -0
src/Gargantext/Core/Text/Corpus/Parsers.hs          +29  -17
src/Gargantext/Core/Text/Corpus/Parsers/RIS.hs      +2   -4
src/Gargantext/Core/Text/Corpus/Parsers/WOS.hs      +2   -1
src/Gargantext/Core/Text/List/Management.sh         +96  -0
src/Gargantext/Database/Action/Flow.hs              +5   -4
CHANGELOG.md
@@ -3,6 +3,7 @@
 * [BACK][FIX] Username and email to lowerCase always. Use migration script please to avoid errors.
 * [BACK][FIX][Ngrams Change insert causes Database error (#173)](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/173)
 * [FRONTED][CLEAN] Removing Isidore DB for now
+* [BACK][FIX] WOS Parser

 ## Version 0.0.6.9.3
src/Gargantext/Core/Text/Corpus/Parsers.hs
@@ -20,14 +20,15 @@ please follow the types.
 {-# LANGUAGE PackageImports #-}
-module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC)
+module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn)
     where
 -- import Gargantext.Core.Text.Learn (detectLangDefault)
 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
 import Conduit
 import Control.Concurrent.Async as CCA (mapConcurrently)
-import Control.Monad.Trans.Control (MonadBaseControl)
 import Control.Monad (join)
+import Control.Monad.Trans.Control (MonadBaseControl)
 import Data.Attoparsec.ByteString (parseOnly, Parser)
 import Data.Either (Either(..))
 import Data.Either.Extra (partitionEithers)
@@ -38,25 +39,24 @@ import Data.String()
 import Data.Text (Text, intercalate, pack, unpack)
 import Data.Text.Encoding (decodeUtf8)
 import Data.Tuple.Extra (both, first, second)
-import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
-import Gargantext.Core (Lang(..))
-import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
-import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
-import Gargantext.Prelude
 import System.FilePath (FilePath(), takeExtension)
-import System.IO.Temp (emptySystemTempFile)
 import qualified Data.ByteString as DB
 import qualified Data.ByteString.Char8 as DBC
 import qualified Data.ByteString.Lazy as DBL
 import qualified Data.Map as DM
 import qualified Data.Text as DT
-import qualified Prelude
+import System.IO.Temp (emptySystemTempFile)
+import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
+import Gargantext.Core (Lang(..))
+import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
+import Gargantext.Prelude
+import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
+import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
 -- import Gargantext.Core.Text.Learn (detectLangDefault)
 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
+import qualified Prelude
+import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
 ------------------------------------------------------------------------
 type ParseError = String
@@ -168,12 +168,15 @@ parseFormatC _ _ _ = undefined
 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
 parseFile CsvHal    Plain p = parseHal p
 parseFile CsvGargV3 Plain p = parseCsv p
 parseFile RisPresse Plain p = do
   docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
   pure $ Right docs
 parseFile WOS Plain p = do
   docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
   pure $ Right docs
 parseFile ff _ p = do
   docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
   pure $ Right docs
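For orientation, a minimal sketch of calling parseFile as typed above; the module wrapper, the file path and the plain-Prelude I/O are illustrative assumptions, not part of this commit:

  import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileType(..), FileFormat(..))

  -- Parse a Web of Science export and count the parsed documents.
  main :: IO ()
  main = do
    eDocs <- parseFile WOS Plain "/tmp/corpus_wos.txt"   -- hypothetical path
    case eDocs of
      Left err   -> putStrLn ("parse error: " <> err)
      Right docs -> print (length docs)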
@@ -184,19 +187,19 @@ toDoc ff d = do
   -- let abstract = lookup "abstract" d
   let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
-  let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
+  let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
+  printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
   (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
-  pure HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
+  let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
                            , _hd_doi = lookup "doi" d
                            , _hd_url = lookup "URL" d
                            , _hd_uniqId = Nothing
                            , _hd_uniqIdBdd = Nothing
                            , _hd_page = Nothing
                            , _hd_title = lookup "title" d
-                           , _hd_authors = Nothing
-                           , _hd_institutes = lookup "authors" d
+                           , _hd_authors = lookup "authors" d
+                           , _hd_institutes = lookup "institutes" d
                            , _hd_source = lookup "source" d
                            , _hd_abstract = lookup "abstract" d
                            , _hd_publication_date = fmap (DT.pack . show) utcTime

@@ -207,6 +210,8 @@ toDoc ff d = do
                            , _hd_publication_minute = Nothing
                            , _hd_publication_second = Nothing
                            , _hd_language_iso2 = Just $ (DT.pack . show) lang
                            }
+  printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
+  pure hd

 enrichWith :: FileType
            -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
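The two field fixes above are the heart of the WOS repair: _hd_authors is now filled from the "authors" key and _hd_institutes from the "institutes" key produced by the parser. A small sketch with a made-up field list (the sampleFields name and its values are illustrative only):

  {-# LANGUAGE OverloadedStrings #-}
  import Data.Text (Text)

  -- Hypothetical field list as the WOS parser might hand to toDoc:
  sampleFields :: [(Text, Text)]
  sampleFields = [ ("title",      "Some title")
                 , ("authors",    "Doe, J.; Smith, A.")
                 , ("institutes", "CNRS; ISC-PIF")
                 , ("PY",         "2023")
                 ]

  -- With this commit:  lookup "authors"    sampleFields == Just "Doe, J.; Smith, A."
  --                    lookup "institutes" sampleFields == Just "CNRS; ISC-PIF"
  -- Before, _hd_institutes was read from the "authors" key and _hd_authors stayed Nothing.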
@@ -267,3 +272,10 @@ clean txt = DBC.map clean' txt
 clean' '\t' = ' '
 clean' ';'  = '.'
 clean' c    = c
+
+--
+splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
+splitOn Authors (Just "WOS") = (DT.splitOn "; ")
+splitOn _       _            = (DT.splitOn ", ")
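The new splitOn helper exported above (and reused in Flow.hs below) splits a field on "; " when the document comes from WOS and the ngrams are Authors, and on ", " otherwise. A GHCi-style sketch with illustrative inputs:

  -- ghci> :set -XOverloadedStrings
  -- ghci> splitOn Authors (Just "WOS") "Doe, J.; Smith, A."
  -- ["Doe, J.","Smith, A."]
  -- ghci> splitOn Authors Nothing "Doe J., Smith A."
  -- ["Doe J.","Smith A."]
  -- ghci> splitOn Institutes (Just "WOS") "CNRS, ISC-PIF"   -- only Authors + WOS hits the first clause
  -- ["CNRS","ISC-PIF"]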
src/Gargantext/Core/Text/Corpus/Parsers/RIS.hs
@@ -23,7 +23,7 @@ import Data.List (lookup)
 import Control.Applicative
 import Data.Attoparsec.ByteString (Parser, try, takeTill, take, many1)
 import Data.Attoparsec.ByteString.Char8 (isEndOfLine)
-import Data.ByteString (ByteString, concat)
+import Data.ByteString (ByteString, intercalate)
 import Gargantext.Prelude hiding (takeWhile, take)
 import qualified Data.List as DL
 -------------------------------------------------------------

@@ -55,7 +55,7 @@ fieldWith n = do
     let txts' = case DL.length txts > 0 of
           True  -> txts
           False -> []
-    pure (name, concat ([txt] <> txts'))
+    pure (name, intercalate ";" ([txt] <> txts'))

 lines :: Parser [ByteString]

@@ -70,5 +70,3 @@ onField :: ByteString -> (ByteString -> [(ByteString, ByteString)])
            -> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
 onField k f m = m <> (maybe [] f (lookup k m))
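The fieldWith change above keeps a ";" separator between continuation lines of a RIS field instead of concatenating them directly. The difference on two illustrative ByteString chunks:

  -- ghci> :set -XOverloadedStrings
  -- ghci> import qualified Data.ByteString.Char8 as C8
  -- ghci> C8.concat      ["Doe, J.", "Smith, A."]
  -- "Doe, J.Smith, A."
  -- ghci> C8.intercalate ";" ["Doe, J.", "Smith, A."]
  -- "Doe, J.;Smith, A."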
src/Gargantext/Core/Text/Corpus/Parsers/WOS.hs
@@ -52,6 +52,7 @@ keys field
   | field == "TI" = "title"
   | field == "SO" = "source"
   | field == "DI" = "doi"
   | field == "PY" = "publication_date"
   | field == "PD" = "publication_date"
   | field == "SP" = "institutes"
   | field == "AB" = "abstract"
   | otherwise     = field
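Per the table above, WOS field tags are renamed to the canonical keys used downstream: both PY and PD map to "publication_date", SP maps to "institutes", and unknown tags pass through unchanged. A quick GHCi-style sketch (string literals assume OverloadedStrings):

  -- ghci> keys "PY"
  -- "publication_date"
  -- ghci> keys "PD"
  -- "publication_date"
  -- ghci> keys "XX"    -- unknown tags pass through unchanged
  -- "XX"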
src/Gargantext/Core/Text/List/Management.sh (new file, 0 → 100644)
{-|
Module      : Gargantext.Core.Text.Ngrams.List.Management
Description : Tools to manage lists
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}

{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell     #-}

module Gargantext.Core.Text.List.Management
  where

{-
import Data.HashMap.Strict (HashMap)
import Data.Map (Map)
import Gargantext.API.Ngrams
import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
import Gargantext.Database.Action.Flow.Types
import Gargantext.API.Ngrams.Tools (getListNgrams)
import Gargantext.Core.NodeStory
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Group
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithStem
import Gargantext.Core.Text.List.Social
import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
import Gargantext.Core.Types (ListType(..), CorpusId, ListId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
import Gargantext.Database.Query.Table.Node (defaultList, getClosestParentIdByType)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Action.Metrics.NgramsByContext (getOccByNgramsOnlyFast')
import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HashMap
import qualified Data.HashSet as HashSet
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap

restrictListSize
  :: forall env err m.
     (HasNodeStory env err m, FlowCmdM env err m)
  => CorpusId
  -> ListId
  -> NgramsType
  -> ListType
  -> Int -- ^ number of ngram pairs to keep
  -> m ()
restrictListSize corpusId listId ngramsType listType size = do
  ngrams <- getListNgrams [listId] ngramsType
  -- corpus_id <- getClosestParentIdByType
  occurrences <- getOccByNgramsOnlyFast' corpusId
                                         listId
                                         ngramsType
                                         (HashMap.keys ngrams)

  ngrams' <- filterWith listType size occurrences ngrams

  _ <- setListNgrams listId ngramsType ngrams'
  return ()

  where filterWith :: ListType -> Int -> HashMap NgramsTerm Int
                   -> HashMap NgramsTerm NgramsRepoElement
                   -> m (Map NgramsTerm NgramsRepoElement)
        filterWith listType' size occs ngrams =
          HashMap.filter with ngrams
            where
              with nre = case (&&) <$> Just (nre^.nre_list == listType)
                                   <*> (HashMap.lookup (nre^.nre_root) occs
                                   &&
-}
src/Gargantext/Database/Action/Flow.hs
@@ -75,11 +75,11 @@ import qualified Data.Conduit as C
 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
 import Gargantext.Core (Lang(..), PosTagAlgo(..))
-import Gargantext.Core.Ext.IMT (toSchoolName)
+-- import Gargantext.Core.Ext.IMT (toSchoolName)
 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
 import Gargantext.Core.Flow.Types
 import Gargantext.Core.Text
-import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
+import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType, splitOn)
 import Gargantext.Core.Text.List (buildNgramsLists)
 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))

@@ -550,13 +550,14 @@ instance ExtractNgramsT HyperdataDocument
                      $ _hd_source doc
-      institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", ")) $ _hd_institutes doc
+      institutes = map text2ngrams $ maybe ["Nothing"] (splitOn Institutes (doc ^. hd_bdd)) $ _hd_institutes doc
-      authors    = map text2ngrams $ maybe ["Nothing"] (T.splitOn ", ") $ _hd_authors doc
+      authors    = map text2ngrams $ maybe ["Nothing"] (splitOn Authors (doc ^. hd_bdd)) $ _hd_authors doc
       termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
                           <$> concat
                           <$> liftBase (extractTerms lang' $ hasText doc)
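The instance change above makes author and institute splitting depend on the document's source database (hd_bdd), so WOS records, where authors are separated by "; ", keep full author names intact. A hedged sketch, assuming a document whose _hd_bdd is Just "WOS" (values illustrative):

  -- doc ^. hd_bdd == Just "WOS"                    -- set by toDoc as DT.pack (show WOS)
  -- splitOn Authors (doc ^. hd_bdd) "Doe, J.; Smith, A."
  --   == ["Doe, J.","Smith, A."]
  -- Previously: T.splitOn ", " "Doe, J.; Smith, A."
  --   == ["Doe","J.; Smith","A."]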