[refactor] simplify dateSplit (no lang needed)

Also, use ISO639 languages in some places.
parent 9b4bef67
Pipeline #4496 failed with stages
in 10 minutes and 16 seconds
......@@ -954,3 +954,4 @@ test-suite garg-test
, unordered-containers ^>= 0.2.16.0
, validity ^>= 0.11.0.1
default-language: Haskell2010
......@@ -93,7 +93,7 @@ documentUpload nId doc = do
Just c -> c
Nothing -> panic $ T.pack $ "[G.A.N.DU] Node has no corpus parent: " <> show nId
(theFullDate, (year, month, day)) <- liftBase $ dateSplit EN
(theFullDate, (year, month, day)) <- liftBase $ dateSplit
$ Just
$ view du_date doc
......
......@@ -17,6 +17,7 @@ module Gargantext.Core
import Data.Aeson
import Data.Either(Either(Left))
import Data.Hashable (Hashable)
import Data.LanguageCodes qualified as ISO639
import Data.Maybe (fromMaybe)
import Data.Morpheus.Types (GQLType)
import Data.Swagger
......@@ -45,6 +46,7 @@ import qualified Data.Map as Map
-- | All languages supported
-- NOTE: Use international country codes
-- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
-- TODO This should be deprecated in favor of iso-639 library
data Lang = All
| DE
| EL
......@@ -93,6 +95,23 @@ instance Hashable Lang
instance Arbitrary Lang where
arbitrary = arbitraryBoundedEnum
toISO639 :: Lang -> Maybe ISO639.ISO639_1
toISO639 DE = Just ISO639.DE
toISO639 EL = Just ISO639.EL
toISO639 EN = Just ISO639.EN
toISO639 ES = Just ISO639.ES
toISO639 FR = Just ISO639.FR
toISO639 IT = Just ISO639.IT
toISO639 PL = Just ISO639.PL
toISO639 PT = Just ISO639.PT
toISO639 RU = Just ISO639.RU
toISO639 UK = Just ISO639.UK
toISO639 ZH = Just ISO639.ZH
toISO639 All = Nothing
toISO639EN :: Lang -> ISO639.ISO639_1
toISO639EN l = fromMaybe ISO639.EN $ toISO639 l
-- | https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
toISO639Lang :: Lang -> Maybe Text
toISO639Lang All = Nothing
......
......@@ -24,7 +24,7 @@ import Data.Either (Either(..))
import Data.Maybe
import qualified Data.Text as T
import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
import Gargantext.Core (Lang(..))
import Gargantext.Core (Lang(..), toISO639)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv
......@@ -63,7 +63,7 @@ get externalAPI la q mPubmedAPIKey limit = do
--docs <- PUBMED.get q default_limit -- EN only by default
--pure (Just $ fromIntegral $ length docs, yieldMany docs)
Arxiv -> Right <$> Arxiv.get la corpusQuery limit
HAL -> first ExternalAPIError <$> HAL.getC la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
HAL -> first ExternalAPIError <$> HAL.getC (toISO639 la) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
IsTex -> do docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
Isidore -> do docs <- ISIDORE.get la (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing
......
......@@ -18,7 +18,7 @@ import Data.LanguageCodes qualified as ISO639
import Data.Map.Strict qualified as Map
import Data.Maybe
import Data.Text (Text, pack, intercalate)
import Gargantext.Core (Lang(..))
-- import Gargantext.Core (Lang(..), toISO639Lang)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Defaults qualified as Defaults
......@@ -28,40 +28,26 @@ import HAL.Client qualified as HAL
import HAL.Doc.Corpus qualified as HAL
import Servant.Client (ClientError)
toLang :: Lang -> Maybe ISO639.ISO639_1
toLang DE = Just ISO639.DE
toLang EL = Just ISO639.EL
toLang EN = Just ISO639.EN
toLang ES = Just ISO639.ES
toLang FR = Just ISO639.FR
toLang IT = Just ISO639.IT
toLang PL = Just ISO639.PL
toLang PT = Just ISO639.PT
toLang RU = Just ISO639.RU
toLang UK = Just ISO639.UK
toLang ZH = Just ISO639.ZH
toLang All = Nothing
get :: Lang -> Text -> Maybe Int -> IO [HyperdataDocument]
get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument]
get la q ml = do
eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) (toLang la)
eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) la
either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs
getC :: Lang -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do
eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) (toLang la)
eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) la
pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes
-- case eRes of
-- Left err -> panic $ pack $ show err
-- Right (len, docsC) -> pure (len, docsC .| mapMC (toDoc' la))
toDoc' :: Lang -> HAL.Corpus -> IO HyperdataDocument
toDoc' :: Maybe ISO639.ISO639_1 -> HAL.Corpus -> IO HyperdataDocument
toDoc' la (HAL.Corpus { .. }) = do
-- printDebug "[toDoc corpus] h" h
(utctime, (pub_year, pub_month, pub_day)) <-
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) Just _corpus_date)
Date.dateSplit (maybe (Just $ pack $ show Defaults.year) Just _corpus_date)
let abstractDefault = intercalate " " _corpus_abstract
let abstract = case toLang la of
let abstract = case la of
Nothing -> abstractDefault
Just l -> fromMaybe abstractDefault (intercalate " " <$> Map.lookup l _corpus_abstract_lang_map)
pure HyperdataDocument { _hd_bdd = Just "Hal"
......
......@@ -35,13 +35,13 @@ get la l q a = do
let
printErr (DecodeFailure e _) = panic e
printErr e = panic (cs $ show e)
toIsidoreDocs :: Reply -> [IsidoreDoc]
toIsidoreDocs (ReplyOnly r) = [r]
toIsidoreDocs (Replies rs) = rs
iDocs <- either printErr _content <$> Isidore.get l q a
hDocs <- mapM (\d -> isidoreToDoc la d) (toIsidoreDocs iDocs)
pure hDocs
......@@ -58,7 +58,7 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
author :: Author -> Text
author (Author fn ln) = (_name fn) <> ", " <> (_name ln)
author (Authors aus) = Text.intercalate ". " $ map author aus
creator2text :: Creator -> Text
creator2text (Creator au) = author au
creator2text (Creators aus') = Text.intercalate ". " $ map author aus'
......@@ -67,9 +67,9 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
langText (LangText _l t1) = t1
langText (OnlyText t2 ) = t2
langText (ArrayText ts ) = Text.intercalate " " $ map langText ts
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit l (maybe (Just $ Text.pack $ show Defaults.year) (Just) d)
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit (maybe (Just $ Text.pack $ show Defaults.year) (Just) d)
pure HyperdataDocument
{ _hd_bdd = Just "Isidore"
, _hd_doi = Nothing
......@@ -91,5 +91,3 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (Text.pack . show) l
}
......@@ -82,7 +82,7 @@ toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument
toDoc la (ISTEX.Document i t a ab d s) = do
--printDebug "ISTEX date" d
(utctime, (pub_year, pub_month, pub_day)) <-
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d)
Date.dateSplit (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d)
--printDebug "toDoc Istex" (utctime, (pub_year, pub_month, pub_day))
pure $ HyperdataDocument { _hd_bdd = Just "Istex"
, _hd_doi = Just i
......
......@@ -236,7 +236,7 @@ toDoc ff d = do
let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
-- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit dateToParse
let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
, _hd_doi = lookup "doi" d
......
......@@ -48,10 +48,10 @@ import qualified Data.List as List
------------------------------------------------------------------------
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
dateSplit l (Just txt) = do
utcTime <- parse l txt
dateSplit :: Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
dateSplit Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
dateSplit (Just txt) = do
utcTime <- parse txt
let (y, m, d) = split' utcTime
pure (Just utcTime, (Just y, Just m,Just d))
......@@ -72,15 +72,15 @@ type Day = Int
-- 1900-04-01 19:00:00 UTC
-- >>> parse EN (pack "April 1 1900")
-- 1900-04-01 00:00:00 UTC
parse :: Lang -> Text -> IO UTCTime
parse lang s = do
parse :: Text -> IO UTCTime
parse s = do
-- printDebug "Date: " s
let result = dateFlow (DucklingFailure s)
--printDebug "Date': " dateStr'
case result of
DateFlowSuccess ok -> pure ok
DateFlowFailure -> (withDebugMode (DebugMode True)
"[G.C.T.P.T.Date parse]" (lang,s)
"[G.C.T.P.T.Date parse]" s
$ getCurrentTime)
_ -> panic "[G.C.T.C.Parsers.Date] parse: Should not happen"
......@@ -206,4 +206,3 @@ parseDateWithDuckling lang input options = do
--pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
-- TODO check/test Options False or True
analyze input contxt options $ HashSet.fromList [(Seal Time)]
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata
<<<<<<< HEAD
Description : To query Wikidata
Description : To query Wikidata
=======
Description : To query Wikidata
>>>>>>> dev-clustering
......@@ -68,7 +68,7 @@ wikiPageToDocument m wr = do
source = Nothing
abstract = Just $ concat $ take m sections
(date, (year, month, day)) <- dateSplit EN $ head
(date, (year, month, day)) <- dateSplit $ head
$ catMaybes
[ wr ^. wr_yearStart
, wr ^. wr_yearEnd
......@@ -130,4 +130,3 @@ wikidataQuery n = List.unlines
," }"
," LIMIT " <> (cs $ show n)
]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment