From 84c042817462da9cc2f1af075b89f63b18115905 Mon Sep 17 00:00:00 2001
From: Christian Merten <christian@merten.dev>
Date: Sun, 11 Jun 2023 19:20:13 +0200
Subject: [PATCH] corpus/parsers: add gitlab issue parser

---
Review notes (not part of the commit message):
- publication seconds are truncated instead of rounded, so a creation time
  with 59.5s or more no longer produces the out-of-range second 60
- Haddock comments added to the new module
- commit/blob hashes and the whitespace inside the gargantext.cabal hunks
  were not regenerated; use `git apply --ignore-whitespace` if needed

 gargantext.cabal                              |  3 +-
 .../Core/Text/Corpus/Parsers/Gitlab.hs        | 79 +++++++++++++++++++
 2 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 src/Gargantext/Core/Text/Corpus/Parsers/Gitlab.hs

diff --git a/gargantext.cabal b/gargantext.cabal
index 1fa6bbd2..f3e418f1 100644
--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -5,7 +5,7 @@ cabal-version: 1.12
 -- see: https://github.com/sol/hpack
 
 name: gargantext
-version: 0.0.6.9.9.6.2
+version: 0.0.6.9.9.6.2
 synopsis: Search, map, share
 description: Please see README.md
 category: Data
@@ -194,6 +194,7 @@ library
       Gargantext.Core.Text.Corpus.Parsers.Date
       Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec
       Gargantext.Core.Text.Corpus.Parsers.FrameWrite
+      Gargantext.Core.Text.Corpus.Parsers.Gitlab
       Gargantext.Core.Text.Corpus.Parsers.GrandDebat
       Gargantext.Core.Text.Corpus.Parsers.Iramuteq
       Gargantext.Core.Text.Corpus.Parsers.Isidore
diff --git a/src/Gargantext/Core/Text/Corpus/Parsers/Gitlab.hs b/src/Gargantext/Core/Text/Corpus/Parsers/Gitlab.hs
new file mode 100644
index 00000000..57f40baf
--- /dev/null
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/Gitlab.hs
@@ -0,0 +1,79 @@
+module Gargantext.Core.Text.Corpus.Parsers.Gitlab (
+  Issue(..), gitlabIssue2hyperdataDocument, readFile_Issues, readFile_IssuesAsDocs
+) where
+
+import Data.Aeson
+import Data.Time
+import qualified Data.Text as DT
+import qualified Data.ByteString.Lazy as DBL
+import System.FilePath (FilePath)
+
+import Gargantext.Prelude
+import Gargantext.Core (Lang(..))
+import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
+
+-- | A single issue as decoded from the JSON input.
+data Issue = Issue { _issue_id      :: !Int
+                   , _issue_title   :: !DT.Text
+                   , _issue_content :: !DT.Text
+                   , _issue_created :: !LocalTime
+                   , _issue_closed  :: !(Maybe UTCTime)
+                   }
+  deriving (Show)
+
+-- | Decodes an 'Issue' from a JSON object whose fields are stored under
+-- the positional keys @c0@ .. @c4@; only @c4@ (close time) is optional.
+instance FromJSON Issue where
+  parseJSON = withObject "Issue" $ \v -> Issue
+    <$> v .:  "c0" -- id
+    <*> v .:  "c1" -- title
+    <*> v .:  "c2" -- content
+    <*> v .:  "c3" -- creation time
+    <*> v .:? "c4" -- close time
+
+-- | Converts an 'Issue' into a 'HyperdataDocument': title and content
+-- become the document title and abstract, and the creation time fills
+-- the publication date fields. The language is always set to 'EN'.
+gitlabIssue2hyperdataDocument :: Issue -> HyperdataDocument
+gitlabIssue2hyperdataDocument issue = HyperdataDocument
+  { _hd_bdd = Nothing
+  , _hd_doi = Nothing
+  , _hd_url = Nothing
+  , _hd_uniqId = Nothing
+  , _hd_uniqIdBdd = Nothing
+  , _hd_page = Nothing
+  , _hd_title = Just (_issue_title issue)
+  , _hd_authors = Nothing
+  , _hd_institutes = Nothing
+  , _hd_source = Nothing
+  , _hd_abstract = Just (_issue_content issue)
+  , _hd_publication_date = Just $ DT.pack $ show date
+  , _hd_publication_year = Just $ fromIntegral year
+  , _hd_publication_month = Just month
+  , _hd_publication_day = Just day
+  , _hd_publication_hour = Just (todHour tod)
+  , _hd_publication_minute = Just (todMin tod)
+  -- 'truncate', not 'round': rounding a fractional second >= 59.5 would
+  -- yield the out-of-range value 60.
+  , _hd_publication_second = Just (truncate $ todSec tod)
+  , _hd_language_iso2 = Just $ (DT.pack . show) lang
+  }
+  where
+    lang = EN
+    date = _issue_created issue
+    (year, month, day) = toGregorian $ localDay date
+    tod = localTimeOfDay date
+
+-- | Reads a list of 'Issue's from the JSON file at the given path.
+-- Yields the empty list when the file content cannot be decoded.
+readFile_Issues :: FilePath -> IO [Issue]
+readFile_Issues fp = do
+  raw <- DBL.readFile fp
+  case decode raw of
+    Just is -> pure is
+    Nothing -> pure [] -- NOTE(review): decode failures are dropped silently
+
+-- | Like 'readFile_Issues', but converts every issue with
+-- 'gitlabIssue2hyperdataDocument'.
+readFile_IssuesAsDocs :: FilePath -> IO [HyperdataDocument]
+readFile_IssuesAsDocs = fmap (fmap gitlabIssue2hyperdataDocument) . readFile_Issues
-- 
2.21.0