Commit 55ae7b50 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[API] simplify parsing of date

We don't need the IO monad there, just added a default date for
Duckling to make the parser.
parent 70a94c12
......@@ -89,10 +89,9 @@ documentUpload _uId nId doc logStatus = do
Just c -> c
Nothing -> panic $ T.pack $ "[G.A.N.DU] Node has no corpus parent: " <> show nId
(theFullDate, (year, month, day)) <- liftBase
$ dateSplit EN
$ Just
$ view du_date doc <> "T:0:0:0"
let (theFullDate, (year, month, day)) = dateSplit EN
$ Just
$ view du_date doc <> "T:0:0:0"
let hd = HyperdataDocument { _hd_bdd = Nothing
, _hd_doi = Nothing
......
......@@ -27,28 +27,29 @@ import qualified HAL.Doc.Corpus as HAL
get :: Lang -> Text -> Maybe Integer -> IO [HyperdataDocument]
get la q ml = do
docs <- HAL.getMetadataWith q (Just 0) (fromIntegral <$> ml)
either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) docs
pure $ either (panic . pack . show) (\d -> map (toDoc' la) $ HAL._docs d) docs
toDoc' :: Lang -> HAL.Corpus -> IO HyperdataDocument
toDoc' :: Lang -> HAL.Corpus -> HyperdataDocument
toDoc' la (HAL.Corpus i t ab d s aus affs struct_id) = do
(utctime, (pub_year, pub_month, pub_day)) <- Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) Just d)
pure $ HyperdataDocument { _hd_bdd = Just "Hal"
, _hd_doi = Just $ pack $ show i
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = Just $ intercalate " " t
, _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" aus
, _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" $ affs <> map (cs . show) struct_id
, _hd_source = Just $ maybe "Nothing" identity s
, _hd_abstract = Just $ intercalate " " ab
, _hd_publication_date = fmap (pack . show) utctime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (pack . show) la }
let (utctime, (pub_year, pub_month, pub_day)) =
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) Just d)
HyperdataDocument { _hd_bdd = Just "Hal"
, _hd_doi = Just $ pack $ show i
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = Just $ intercalate " " t
, _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" aus
, _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" $ affs <> map (cs . show) struct_id
, _hd_source = Just $ maybe "Nothing" identity s
, _hd_abstract = Just $ intercalate " " ab
, _hd_publication_date = fmap (pack . show) utctime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (pack . show) la }
......@@ -42,7 +42,7 @@ get la l q a = do
iDocs <- either printErr _content <$> Isidore.get l q a
hDocs <- mapM (\d -> isidoreToDoc la d) (toIsidoreDocs iDocs)
let hDocs = map (\d -> isidoreToDoc la d) (toIsidoreDocs iDocs)
pure hDocs
isidore2csvFile :: FilePath -> Lang -> Maybe Isidore.Limit
......@@ -52,7 +52,7 @@ isidore2csvFile fp la li tq aq = do
hdocs <- get la li tq aq
writeDocs2Csv fp hdocs
isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
isidoreToDoc :: Lang -> IsidoreDoc -> HyperdataDocument
isidoreToDoc l (IsidoreDoc t a d u s as) = do
let
author :: Author -> Text
......@@ -68,26 +68,28 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
langText (OnlyText t2 ) = t2
langText (ArrayText ts ) = Text.intercalate " " $ map langText ts
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit l (maybe (Just $ Text.pack $ show Defaults.year) (Just) d)
let (utcTime, (pub_year, pub_month, pub_day)) = Date.dateSplit l (maybe (Just $ Text.pack $ show Defaults.year) (Just) d)
pure $ HyperdataDocument (Just "Isidore")
Nothing
u
Nothing
Nothing
Nothing
(Just $ cleanText $ langText t)
(creator2text <$> as)
Nothing
(Just $ maybe "Nothing" identity $ _sourceName <$> s)
(cleanText <$> langText <$> a)
(fmap (Text.pack . show) utcTime)
(pub_year)
(pub_month)
(pub_day)
Nothing
Nothing
Nothing
(Just $ (Text.pack . show) l)
HyperdataDocument
{ _hd_bdd = Just "Isidore"
, _hd_doi = Nothing
, _hd_url = u
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = Just $ cleanText $ langText t
, _hd_authors = creator2text <$> as
, _hd_institutes = Nothing
, _hd_source = Just $ maybe "Nothing" identity $ _sourceName <$> s
, _hd_abstract = cleanText <$> langText <$> a
, _hd_publication_date = fmap (Text.pack . show) utcTime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (Text.pack . show) l
}
......@@ -13,6 +13,7 @@ Portability : POSIX
module Gargantext.Core.Text.Corpus.API.Istex
where
import Data.Either (Either(..))
import Data.List (concat)
import Data.Maybe
import Data.Text (Text, pack)
......@@ -28,39 +29,49 @@ import qualified ISTEX.Client as ISTEX
get :: Lang -> Text -> Maybe Integer -> IO [HyperdataDocument]
get la q ml = do
--docs <- ISTEX.getMetadataWith q (fromIntegral <$> ml)
printDebug "[Istex.get] calling getMetadataScrollWith for q" q
printDebug "[Istex.get] calling getMetadataScrollWith for ml" ml
printDebug "[Istex.get] calling getMetadataScrollProgress for la" la
printDebug "[Istex.get] calling getMetadataScrollProgress for q" q
printDebug "[Istex.get] calling getMetadataScrollProgress for ml" ml
-- The "scroll" expects "d/h/m/s/ms" time interval. Let's set it to "1 month"
docs <- ISTEX.getMetadataScrollWith q ((\_n -> pack $ "1m") <$> ml) --(fromIntegral <$> ml)
either (panic . pack . show) (toDoc' la) docs
eDocs <- ISTEX.getMetadataScroll q ((\_n -> pack $ "1m") <$> ml) Nothing 0 --(fromIntegral <$> ml)
printDebug "[Istex.get] will print length" (0 :: Int)
case eDocs of
Left _ -> pure ()
Right (ISTEX.Documents { _documents_hits }) -> printDebug "[Istex.get] length docs" $ length _documents_hits
--ISTEX.getMetadataScrollProgress q ((\_ -> pack $ "1m") <$> ml) Nothing progress errorHandler
pure $ either (panic . pack . show) (toDoc' la) eDocs
-- where
-- progress (ISTEX.ScrollResponse { _scroll_documents = ISTEX.Documents { _documents_hits }}) =
-- printDebug "[Istex.get] got docs: " $ length _documents_hits
-- errorHandler err = printDebug "[Istex.get] error" $ show err
toDoc' :: Lang -> ISTEX.Documents -> IO [HyperdataDocument]
toDoc' la docs' = do
toDoc' :: Lang -> ISTEX.Documents -> [HyperdataDocument]
toDoc' la docs' = map (toDoc la) (ISTEX._documents_hits docs')
--printDebug "ISTEX" (ISTEX._documents_total docs')
mapM (toDoc la) (ISTEX._documents_hits docs')
-- | TODO remove dateSplit here
-- TODO current year as default
toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument
toDoc :: Lang -> ISTEX.Document -> HyperdataDocument
toDoc la (ISTEX.Document i t a ab d s) = do
(utctime, (pub_year, pub_month, pub_day)) <- Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d)
pure $ HyperdataDocument { _hd_bdd = Just "Istex"
, _hd_doi = Just i
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = t
, _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" (map ISTEX._author_name a)
, _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" (concat $ (map ISTEX._author_affiliations) a)
, _hd_source = Just $ foldl (\x y -> x <> ", " <> y) "" (catMaybes $ map ISTEX._source_title s)
, _hd_abstract = ab
, _hd_publication_date = fmap (pack . show) utctime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (pack . show) la }
let (utctime, (pub_year, pub_month, pub_day)) =
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d)
HyperdataDocument { _hd_bdd = Just "Istex"
, _hd_doi = Just i
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = t
, _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" (map ISTEX._author_name a)
, _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" (concat $ (map ISTEX._author_affiliations) a)
, _hd_source = Just $ foldl (\x y -> x <> ", " <> y) "" (catMaybes $ map ISTEX._source_title s)
, _hd_abstract = ab
, _hd_publication_date = fmap (pack . show) utctime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (pack . show) la }
......@@ -26,7 +26,6 @@ module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cl
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Control.Monad (join)
import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat, lookup)
......@@ -79,23 +78,23 @@ data FileFormat = WOS | RIS | RisPresse
-- | XML -- Not Implemented / see :
parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
parseFormat :: FileFormat -> DB.ByteString -> Either Prelude.String [HyperdataDocument]
parseFormat CsvGargV3 bs = parseCsv' $ DBL.fromStrict bs
parseFormat CsvHal bs = parseHal' $ DBL.fromStrict bs
parseFormat RisPresse bs = do
docs <- mapM (toDoc RIS)
let docs = map (toDoc RIS)
<$> snd
<$> enrichWith RisPresse
$ partitionEithers
$ [runParser' RisPresse bs]
pure $ Right docs
Right docs
parseFormat WOS bs = do
docs <- mapM (toDoc WOS)
let docs = map (toDoc WOS)
<$> snd
<$> enrichWith WOS
$ partitionEithers
$ [runParser' WOS bs]
pure $ Right docs
Right docs
parseFormat ZIP bs = do
path <- emptySystemTempFile "parsed-zip"
DB.writeFile path bs
......@@ -111,16 +110,16 @@ parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocum
parseFile CsvHal p = parseHal p
parseFile CsvGargV3 p = parseCsv p
parseFile RisPresse p = do
docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
docs <- map (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
pure $ Right docs
parseFile WOS p = do
docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
docs <- map (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
pure $ Right docs
parseFile ff p = do
docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
docs <- map (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
pure $ Right docs
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc :: FileFormat -> [(Text, Text)] -> HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
-- let abstract = lookup "abstract" d
......@@ -128,27 +127,27 @@ toDoc ff d = do
let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
, _hd_doi = lookup "doi" d
, _hd_url = lookup "URL" d
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = lookup "title" d
, _hd_authors = Nothing
, _hd_institutes = lookup "authors" d
, _hd_source = lookup "source" d
, _hd_abstract = lookup "abstract" d
, _hd_publication_date = fmap (DT.pack . show) utcTime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (DT.pack . show) lang }
let (utcTime, (pub_year, pub_month, pub_day)) = Date.dateSplit lang dateToParse
HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
, _hd_doi = lookup "doi" d
, _hd_url = lookup "URL" d
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = lookup "title" d
, _hd_authors = Nothing
, _hd_institutes = lookup "authors" d
, _hd_source = lookup "source" d
, _hd_abstract = lookup "abstract" d
, _hd_publication_date = fmap (DT.pack . show) utcTime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (DT.pack . show) lang }
enrichWith :: FileFormat
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
......
......@@ -21,10 +21,12 @@ module Gargantext.Core.Text.Corpus.Parsers.Date
where
import Data.Aeson (toJSON, Value)
import Data.Either (Either(..))
import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn)
import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian)
import Data.Time.Clock (UTCTime(..), getCurrentTime)
import qualified Data.Time.Calendar as DTC
import Data.Time.Clock (UTCTime(..), secondsToDiffTime)
import Data.Time.LocalTime (utc)
import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
import Duckling.Api (analyze)
......@@ -41,12 +43,12 @@ import qualified Duckling.Core as DC
------------------------------------------------------------------------
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
dateSplit :: Lang -> Maybe Text -> (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
dateSplit _ Nothing = (Nothing, (Nothing, Nothing, Nothing))
dateSplit l (Just txt) = do
utcTime <- parse l txt
let utcTime = parse l txt
let (y, m, d) = split' utcTime
pure (Just utcTime, (Just y, Just m,Just d))
(Just utcTime, (Just y, Just m,Just d))
split' :: UTCTime -> (Year, Month, Day)
split' utcTime = (fromIntegral y, m, d)
......@@ -65,8 +67,8 @@ type Day = Int
-- 1979-04-10 19:00:00 UTC
-- >>> parseDate EN (pack "April 10 1979")
-- 1979-04-10 00:00:00 UTC
parse :: Lang -> Text -> IO UTCTime
parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
parse :: Lang -> Text -> UTCTime
parse = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0"
type DateFormat = Text
type DateDefault = Text
......@@ -75,17 +77,18 @@ parseDate' :: DateFormat
-> DateDefault
-> Lang
-> Text
-> IO UTCTime
-> UTCTime
parseDate' format def lang s = do
dateStr' <- parseRaw lang s
if dateStr' == ""
then getCurrentTime
else do
let dateStr' = parseRaw lang s
case dateStr' of
Left _err -> defaultUTCTime
Right "" -> defaultUTCTime
Right ds -> do
let dateStr = unpack
$ maybe def identity
$ head
$ splitOn "." dateStr'
pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
$ splitOn "." ds
parseTimeOrError True defaultTimeLocale (unpack format) dateStr
-- TODO add Paris at Duckling.Locale Region datatype
......@@ -102,16 +105,15 @@ parserLang _ = panic "not implemented"
-- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
-- parseRaw :: Context -> Text -> SomeErrorHandling Text
parseRaw :: Lang -> Text -> IO Text
parseRaw :: Lang -> Text -> Either Text Text
parseRaw lang text = do -- case result
maybeResult <- extractValue <$> getTimeValue
<$> parseDateWithDuckling lang text (Options True)
let maybeResult = extractValue $ getTimeValue
$ parseDateWithDuckling lang text (Options True)
case maybeResult of
Just result -> pure result
Just result -> Right result
Nothing -> do
printDebug ("[G.C.T.C.P.D.parseRaw] ERROR " <> (cs . show) lang)
text
pure ""
-- printDebug ("[G.C.T.C.P.D.parseRaw] ERROR " <> (cs . show) lang) text
Left $ "[G.C.T.C.P.D.parseRaw ERROR] " <> (cs . show) lang <> " :: " <> text
getTimeValue :: [ResolvedToken] -> Maybe Value
getTimeValue rt = case head rt of
......@@ -136,15 +138,21 @@ utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time
-- | Local Context which depends on Lang and Time
localContext :: Lang -> DucklingTime -> Context
localContext lang dt = Context {referenceTime = dt, locale = makeLocale (parserLang lang) Nothing}
localContext lang dt = Context { referenceTime = dt
, locale = makeLocale (parserLang lang) Nothing }
-- | Date parser with Duckling
parseDateWithDuckling :: Lang -> Text -> Options -> IO [ResolvedToken]
parseDateWithDuckling lang input options = do
contxt <- localContext lang <$> utcToDucklingTime <$> getCurrentTime
--pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
-- TODO check/test Options False or True
pure $ analyze input contxt options $ HashSet.fromList [(Seal Time)]
defaultDay :: DTC.Day
defaultDay = DTC.fromGregorian 1 1 1
defaultUTCTime :: UTCTime
defaultUTCTime = UTCTime { utctDay = defaultDay
, utctDayTime = secondsToDiffTime 0 }
-- | Date parser with Duckling
parseDateWithDuckling :: Lang -> Text -> Options -> [ResolvedToken]
parseDateWithDuckling lang input options = do
let contxt = localContext lang $ utcToDucklingTime defaultUTCTime
--pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
-- TODO check/test Options False or True
analyze input contxt options $ HashSet.fromList [(Seal Time)]
......@@ -68,8 +68,7 @@ wikiPageToDocument m wr = do
source = Nothing
abstract = Just $ concat $ take m sections
(date, (year, month, day))
<- dateSplit EN $ head
let (date, (year, month, day)) = dateSplit EN $ head
$ catMaybes
[ wr ^. wr_yearStart
, wr ^. wr_yearEnd
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment