[flow] rewrite the broken mDateSplit function

Thing is, it parsed dates incorrectly, e.g. "1 18 8" -> "1188-01-01
00:00:00 UTC"
parent 5c9ecde5
Pipeline #7946 passed with stages
in 50 minutes and 31 seconds
......@@ -227,7 +227,6 @@ library
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.Types
Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.TSV
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.List
......@@ -258,6 +257,7 @@ library
Gargantext.Core.Types.Query
Gargantext.Core.Utils
Gargantext.Core.Utils.Aeson
Gargantext.Core.Utils.DateUtils
Gargantext.Core.Utils.Prefix
Gargantext.Core.Viz.Graph.Index
Gargantext.Core.Viz.Graph.Tools
......@@ -419,7 +419,6 @@ library
Gargantext.Core.Text.Terms.Multi.PosTagging
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Gargantext.Core.Types.Search
Gargantext.Core.Utils.DateUtils
Gargantext.Core.Utils.Swagger
Gargantext.Core.Viz
Gargantext.Core.Viz.Chart
......
......@@ -30,11 +30,11 @@ import Gargantext.API.Worker (serveWorkerAPI)
import Gargantext.Core (Lang(..))
import Gargantext.Core.NLP (HasNLPServer)
import Gargantext.Core.NodeStory.Types (HasNodeStoryEnv)
import Gargantext.Core.Text.Corpus.Parsers.Date (mDateSplit)
import Gargantext.Core.Text.Terms (TermType(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Worker.Jobs.Types (WorkSplit(..))
import Gargantext.Core.Worker.Jobs.Types qualified as Jobs
import Gargantext.Core.Utils.DateUtils (parseFlexibleTimeWithSplit)
import Gargantext.Database.Action.Flow (addDocumentsToHyperCorpus)
import Gargantext.Database.Action.Flow.Types ( FlowCmdM )
import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataCorpus )
......@@ -78,7 +78,7 @@ documentUpload nId doc = do
Nothing -> panicTrace $ T.pack $ "[G.A.N.DU] Node has no corpus parent: " <> show nId
let mDateS = Just $ view du_date doc
let (theFullDate, (year, month, day)) = mDateSplit mDateS
let (theFullDate, (year, month, day)) = parseFlexibleTimeWithSplit mDateS
let hd = HyperdataDocument { _hd_bdd = Nothing
, _hd_doi = Nothing
......
......@@ -31,11 +31,11 @@ import Gargantext.API.Routes.Named.Document qualified as Named
import Gargantext.API.Worker (serveWorkerAPI)
import Gargantext.Core (Lang(..))
import Gargantext.Core.NodeStory (currentVersion, hasNodeStory)
import Gargantext.Core.Text.Corpus.Parsers.Date (split')
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (Author(..), Parsed(..), parseLines, text2titleParagraphs)
import Gargantext.Core.Text.Terms (TermType(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Worker.Jobs.Types qualified as Jobs
import Gargantext.Core.Utils.DateUtils (yymmddSplit)
import Gargantext.Database.Action.Flow (flowDataText, DataText(..))
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
......@@ -132,7 +132,7 @@ hyperdataDocumentFromFrameWrite lang paragraphSize (node, contents) =
authors' = T.concat $ authorJoinSingle <$> authors
--{-
(year',month',day') = split' (node^. node_date)
(year',month',day') = yymmddSplit (node^. node_date)
date' = Just $ T.concat [ T.pack $ show year', "-"
, T.pack $ show month', "-"
, T.pack $ show day'
......
......@@ -16,8 +16,8 @@ import Conduit ( ConduitT, (.|), mapMC )
import Data.LanguageCodes qualified as ISO639
import Data.Map.Strict qualified as Map
import Data.Text (pack)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Core.Utils (nonemptyIntercalate)
import Gargantext.Core.Utils.DateUtils (parseFlexibleTimeWithSplit)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (intercalate)
......@@ -43,7 +43,7 @@ toDoc' :: Maybe ISO639.ISO639_1 -> HAL.Document -> IO HyperdataDocument
toDoc' la (HAL.Document { .. }) = do
-- printDebug "[toDoc corpus] h" h
let mDateS = _document_date <|> Just (pack $ show Defaults.year)
let (utctime, (pub_year, pub_month, pub_day)) = Date.mDateSplit mDateS
let (utctime, (pub_year, pub_month, pub_day)) = parseFlexibleTimeWithSplit mDateS
let abstractDefault = unwords _document_abstract
let abstract = case la of
Nothing -> abstractDefault
......@@ -57,7 +57,7 @@ toDoc' la (HAL.Document { .. }) = do
, _hd_institutes = Just $ nonemptyIntercalate ", " $ zipWith (\affialition structId -> affialition <> " | " <> structId) _document_authors_affiliations $ map show _document_struct_id
, _hd_source = Just $ maybe "Nothing" identity _document_source
, _hd_abstract = Just abstract
, _hd_publication_date = fmap show utctime
, _hd_publication_date = show <$> utctime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
......
......@@ -22,11 +22,11 @@ import Data.Text qualified as Text
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers (cleanText)
import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Core.Utils.DateUtils (parseFlexibleTimeWithSplit)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (get)
import Isidore qualified as Isidore
import Isidore qualified
import Isidore.Client
import Servant.Client ( ClientError(DecodeFailure) )
......@@ -74,7 +74,7 @@ isidoreToDoc lang (IsidoreDoc t a d u s as) = do
langText (ArrayText ts ) = Text.unwords $ map langText ts
let mDateS = maybe (Just $ Text.pack $ show Defaults.year) Just d
let (utcTime, (pub_year, pub_month, pub_day)) = Date.mDateSplit mDateS
let (utcTime, (pub_year, pub_month, pub_day)) = parseFlexibleTimeWithSplit mDateS
pure HyperdataDocument
{ _hd_bdd = Just "Isidore"
......
......@@ -47,7 +47,6 @@ import Data.Text qualified as DT
import Data.Tuple.Extra (both) -- , first, second)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
import Gargantext.Core.Text.Corpus.Parsers.Iramuteq qualified as Iramuteq
import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC, parseIstex)
......@@ -57,6 +56,7 @@ import Gargantext.Core.Text.Corpus.Parsers.TSV (parseHal, parseTsv, parseTsvC)
import Gargantext.Core.Text.Corpus.Parsers.Types
import Gargantext.Core.Text.Corpus.Parsers.WOS qualified as WOS
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Utils.DateUtils (parseFlexibleTimeWithSplit)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding (show, undefined)
import Gargantext.Utils.Jobs.Error
......@@ -250,7 +250,7 @@ toDoc ff d = do
let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
-- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
let (utcTime, (pub_year, pub_month, pub_day)) = Date.mDateSplit dateToParse
let (utcTime, (pub_year, pub_month, pub_day)) = parseFlexibleTimeWithSplit dateToParse
let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
, _hd_doi = lookup "doi" d
......
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Date
Description : Some utils to parse dates
Copyright : (c) CNRS 2017-present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
According to the language of the text, parseDateRaw returns date as Text:
TODO : Add some tests
import Gargantext.Core.Text.Corpus.Parsers.Date as DGP
DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
-}
{-# OPTIONS_GHC -fno-warn-deprecations #-}
{-# LANGUAGE TypeFamilies #-}
module Gargantext.Core.Text.Corpus.Parsers.Date (
dateSplit
, mDateSplit
, defaultDay
, defaultUTCTime
, split'
) where
import Data.List qualified as List
import Data.Text (unpack, splitOn, replace)
import Data.Time (defaultTimeLocale, iso8601DateFormat, parseTimeM, toGregorian)
import Data.Time.Calendar qualified as DTC
import Data.Time.Clock (UTCTime(..), secondsToDiffTime)
import Gargantext.Prelude hiding (replace)
------------------------------------------------------------------------
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
dateSplit :: Text -> Either Text (UTCTime, (Year, Month, Day))
dateSplit txt = mkSplit <$> parse txt
where
mkSplit utcTime =
let (y, m, d) = split' utcTime in
(utcTime, (y, m, d))
mDateSplit :: Maybe Text -> (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
mDateSplit Nothing = (Nothing, (Nothing, Nothing, Nothing))
mDateSplit (Just md) =
case dateSplit md of
Left _err -> (Nothing, (Nothing, Nothing, Nothing))
Right (ut, (y, m, d)) -> (Just ut, (Just y, Just m, Just d))
split' :: UTCTime -> (Year, Month, Day)
split' (UTCTime day _) = (fromIntegral y, m, d)
where
(y,m,d) = toGregorian day
type Year = Int
type Month = Int
type Day = Int
------------------------------------------------------------------------
-- | Date Parser
-- Parses dates mentions in full text given the language.
-- >>> parse FR (pack "1 avril 1900 à 19H")
-- 1900-04-01 19:00:00 UTC
-- >>> parse EN (pack "April 1 1900")
-- 1900-04-01 00:00:00 UTC
parse :: Text -> Either Text UTCTime
parse s = do
-- printDebug "Date: " s
let result = dateFlow (DucklingFailure s)
--printDebug "Date': " dateStr'
case result of
DateFlowSuccess ok -> Right ok
DateFlowFailure -> Left "[G.C.T.C.Parsers.Date] DateFlowFailure"
-- DateFlowFailure -> (withDebugMode (DebugMode True)
-- "[G.C.T.P.T.Date parse]" s
-- $ getCurrentTime)
_ -> Left "[G.C.T.C.Parsers.Date] parse: Should not happen"
data DateFlow = DucklingSuccess { ds_result :: Text }
| DucklingFailure { df_result :: Text }
| ReadFailure1 { rf1_result :: Text }
| ReadFailure2 { rf2_result :: Text }
| DateFlowSuccess { success :: UTCTime }
| DateFlowFailure
deriving Show
--{-
dateFlow :: DateFlow -> DateFlow
dateFlow (DucklingSuccess res) = case (head $ splitOn "." res) of
Nothing -> dateFlow (ReadFailure1 res)
Just re -> case readDate res of
Nothing -> dateFlow (ReadFailure1 re)
Just ok -> DateFlowSuccess ok
--dateFlow (DucklingFailure txt) = case readDate $ replace " " "T" txt of
dateFlow (DucklingFailure txt) = case readDate (fromMaybe "" $ headMay $ List.filter (/= "") $ splitOn " " txt) of
Nothing -> dateFlow (ReadFailure1 txt)
Just ok -> DateFlowSuccess ok
dateFlow (ReadFailure1 txt) = case readDate txt of
Nothing -> dateFlow $ ReadFailure2 txt
Just ok -> DateFlowSuccess ok
dateFlow (ReadFailure2 txt) = case readDate $ replace " " "" txt <> "-01-01" of
Nothing -> DateFlowFailure
Just ok -> DateFlowSuccess ok
dateFlow _ = DateFlowFailure
--}
readDate :: Text -> Maybe UTCTime
readDate txt = do
--let format = cs $ iso8601DateFormat (Just "%F %H:%M:%S")
let format = cs $ iso8601DateFormat Nothing
parseTimeM True defaultTimeLocale (unpack format) (cs txt)
defaultDay :: DTC.Day
defaultDay = DTC.fromGregorian 1 1 1
defaultUTCTime :: UTCTime
defaultUTCTime = UTCTime { utctDay = defaultDay
, utctDayTime = secondsToDiffTime 0 }
......@@ -18,8 +18,8 @@ module Gargantext.Core.Text.Corpus.Parsers.JSON.Istex where
import Data.Text qualified as T
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Core.Utils (nonemptyIntercalate)
import Gargantext.Core.Utils.DateUtils (parseFlexibleTimeWithSplit)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (length)
......@@ -32,7 +32,7 @@ toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument
toDoc la (ISTEX.Document i t a ab d s) = do
--printDebug "ISTEX date" d
let mDateS = maybe (Just $ T.pack $ show Defaults.year) (Just . T.pack . show) d
let (utctime, (pub_year, pub_month, pub_day)) = Date.mDateSplit mDateS
let (utctime, (pub_year, pub_month, pub_day)) = parseFlexibleTimeWithSplit mDateS
--printDebug "toDoc Istex" (utctime, (pub_year, pub_month, pub_day))
pure $ HyperdataDocument { _hd_bdd = Just "Istex"
, _hd_doi = Just i
......
......@@ -11,8 +11,11 @@ Portability : POSIX
module Gargantext.Core.Utils.DateUtils where
import Gargantext.Prelude
import Data.Text qualified as T
import Data.Time (UTCTime, toGregorian, utctDay)
import Data.Time.Format (parseTimeM, defaultTimeLocale)
import Data.Time.Format.ISO8601 (iso8601ParseM)
import Gargantext.Prelude
--
--readInt :: IO [Char] -> IO Int
......@@ -38,3 +41,32 @@ averageLength l = fromIntegral (sum (map length l)) / fromIntegral (length l)
-- c <- getCurrentTime
-- print c -- $ toYear $ toGregorian $ utctDay c
-- | Parse with multiple format attempts
parseFlexibleTime :: Text -> Maybe UTCTime
parseFlexibleTime t = msum
[ iso8601ParseM s
, parseTimeM True defaultTimeLocale "%Y-%m-%dT%H:%M:%S" s
, parseTimeM True defaultTimeLocale "%Y-%m-%d %H:%M:%S" s
, parseTimeM True defaultTimeLocale "%Y-%m-%d %H:%M:%S UTC" s
, parseTimeM True defaultTimeLocale "%Y-%m-%d" s
, parseTimeM True defaultTimeLocale "%Y-%m-%d" (s <> "-01-01") -- only year was given
, parseTimeM True defaultTimeLocale "%Y-%m-%d" (s <> "-01") -- only year-month was given
]
where
s = T.unpack t
-- | This specific 'Maybe' conglomeration is used in parsers for publication date
parseFlexibleTimeWithSplit :: Maybe Text -> (Maybe UTCTime, (Maybe Int, Maybe Int, Maybe Int))
parseFlexibleTimeWithSplit Nothing = (Nothing, (Nothing, Nothing, Nothing))
parseFlexibleTimeWithSplit (Just t) =
case parseFlexibleTime t of
Nothing -> (Nothing, (Nothing, Nothing, Nothing))
Just dt ->
let (y, m, d) = yymmddSplit dt
in (Just dt, (Just y, Just m, Just d))
yymmddSplit :: UTCTime -> (Int, Int, Int)
yymmddSplit t = (fromIntegral y, m, d)
where
(y,m,d) = toGregorian $ utctDay t
......@@ -15,14 +15,14 @@ commentary with @some markup@.
module Test.Parsers.Date where
import Test.Hspec
import Data.Time.Clock (UTCTime(..), secondsToDiffTime)
import Data.Time.Calendar.OrdinalDate (fromOrdinalDate)
-----------------------------------------------------------
import Gargantext.Core.Utils.DateUtils (parseFlexibleTimeWithSplit)
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.Date (dateSplit)
import Test.Hspec
-----------------------------------------------------------
......@@ -32,7 +32,12 @@ testDateSplit = do
it "works for simple date parsing" $ do
let utc = UTCTime { utctDay = fromOrdinalDate 2010 4
, utctDayTime = secondsToDiffTime 0 }
dateSplit "2010-01-04" `shouldBe` Right (utc, (2010, 1, 4))
parseFlexibleTimeWithSplit (Just "2010-01-04") `shouldBe` (Just utc, (Just 2010, Just 1, Just 4))
it "works for year-month" $ do
parseFlexibleTimeWithSplit (Just "2012-10") `shouldSatisfy` isJust . fst
parseFlexibleTimeWithSplit (Just "2012-10") `shouldSatisfy` (\(_, (y, m, _)) -> y == Just 2012 && m == Just 10)
it "throws error for year-month" $ do
dateSplit "2010-01" `shouldSatisfy` isLeft
it "works for year" $ do
parseFlexibleTimeWithSplit (Just "2011") `shouldSatisfy` isJust . fst
parseFlexibleTimeWithSplit (Just "2011") `shouldSatisfy` (\(_, (y, _m, _)) -> y == Just 2011)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment