[refactor] simplify dateSplit (no lang needed)

Also, use ISO639 languages in some places.
parent 9b4bef67
Pipeline #4496 failed with stages
in 10 minutes and 16 seconds
...@@ -954,3 +954,4 @@ test-suite garg-test ...@@ -954,3 +954,4 @@ test-suite garg-test
, unordered-containers ^>= 0.2.16.0 , unordered-containers ^>= 0.2.16.0
, validity ^>= 0.11.0.1 , validity ^>= 0.11.0.1
default-language: Haskell2010 default-language: Haskell2010
...@@ -93,7 +93,7 @@ documentUpload nId doc = do ...@@ -93,7 +93,7 @@ documentUpload nId doc = do
Just c -> c Just c -> c
Nothing -> panic $ T.pack $ "[G.A.N.DU] Node has no corpus parent: " <> show nId Nothing -> panic $ T.pack $ "[G.A.N.DU] Node has no corpus parent: " <> show nId
(theFullDate, (year, month, day)) <- liftBase $ dateSplit EN (theFullDate, (year, month, day)) <- liftBase $ dateSplit
$ Just $ Just
$ view du_date doc $ view du_date doc
......
...@@ -17,6 +17,7 @@ module Gargantext.Core ...@@ -17,6 +17,7 @@ module Gargantext.Core
import Data.Aeson import Data.Aeson
import Data.Either(Either(Left)) import Data.Either(Either(Left))
import Data.Hashable (Hashable) import Data.Hashable (Hashable)
import Data.LanguageCodes qualified as ISO639
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import Data.Morpheus.Types (GQLType) import Data.Morpheus.Types (GQLType)
import Data.Swagger import Data.Swagger
...@@ -45,6 +46,7 @@ import qualified Data.Map as Map ...@@ -45,6 +46,7 @@ import qualified Data.Map as Map
-- | All languages supported -- | All languages supported
-- NOTE: Use international country codes -- NOTE: Use international country codes
-- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes -- https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes
-- TODO This should be deprecated in favor of iso-639 library
data Lang = All data Lang = All
| DE | DE
| EL | EL
...@@ -93,6 +95,23 @@ instance Hashable Lang ...@@ -93,6 +95,23 @@ instance Hashable Lang
instance Arbitrary Lang where instance Arbitrary Lang where
arbitrary = arbitraryBoundedEnum arbitrary = arbitraryBoundedEnum
toISO639 :: Lang -> Maybe ISO639.ISO639_1
toISO639 DE = Just ISO639.DE
toISO639 EL = Just ISO639.EL
toISO639 EN = Just ISO639.EN
toISO639 ES = Just ISO639.ES
toISO639 FR = Just ISO639.FR
toISO639 IT = Just ISO639.IT
toISO639 PL = Just ISO639.PL
toISO639 PT = Just ISO639.PT
toISO639 RU = Just ISO639.RU
toISO639 UK = Just ISO639.UK
toISO639 ZH = Just ISO639.ZH
toISO639 All = Nothing
toISO639EN :: Lang -> ISO639.ISO639_1
toISO639EN l = fromMaybe ISO639.EN $ toISO639 l
-- | https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes -- | https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
toISO639Lang :: Lang -> Maybe Text toISO639Lang :: Lang -> Maybe Text
toISO639Lang All = Nothing toISO639Lang All = Nothing
......
...@@ -24,7 +24,7 @@ import Data.Either (Either(..)) ...@@ -24,7 +24,7 @@ import Data.Either (Either(..))
import Data.Maybe import Data.Maybe
import qualified Data.Text as T import qualified Data.Text as T
import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs) import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..), toISO639)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude import Gargantext.Prelude
import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv
...@@ -63,7 +63,7 @@ get externalAPI la q mPubmedAPIKey limit = do ...@@ -63,7 +63,7 @@ get externalAPI la q mPubmedAPIKey limit = do
--docs <- PUBMED.get q default_limit -- EN only by default --docs <- PUBMED.get q default_limit -- EN only by default
--pure (Just $ fromIntegral $ length docs, yieldMany docs) --pure (Just $ fromIntegral $ length docs, yieldMany docs)
Arxiv -> Right <$> Arxiv.get la corpusQuery limit Arxiv -> Right <$> Arxiv.get la corpusQuery limit
HAL -> first ExternalAPIError <$> HAL.getC la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit) HAL -> first ExternalAPIError <$> HAL.getC (toISO639 la) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
IsTex -> do docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit) IsTex -> do docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs) pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
Isidore -> do docs <- ISIDORE.get la (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing Isidore -> do docs <- ISIDORE.get la (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing
......
...@@ -18,7 +18,7 @@ import Data.LanguageCodes qualified as ISO639 ...@@ -18,7 +18,7 @@ import Data.LanguageCodes qualified as ISO639
import Data.Map.Strict qualified as Map import Data.Map.Strict qualified as Map
import Data.Maybe import Data.Maybe
import Data.Text (Text, pack, intercalate) import Data.Text (Text, pack, intercalate)
import Gargantext.Core (Lang(..)) -- import Gargantext.Core (Lang(..), toISO639Lang)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Defaults qualified as Defaults import Gargantext.Defaults qualified as Defaults
...@@ -28,40 +28,26 @@ import HAL.Client qualified as HAL ...@@ -28,40 +28,26 @@ import HAL.Client qualified as HAL
import HAL.Doc.Corpus qualified as HAL import HAL.Doc.Corpus qualified as HAL
import Servant.Client (ClientError) import Servant.Client (ClientError)
toLang :: Lang -> Maybe ISO639.ISO639_1 get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument]
toLang DE = Just ISO639.DE
toLang EL = Just ISO639.EL
toLang EN = Just ISO639.EN
toLang ES = Just ISO639.ES
toLang FR = Just ISO639.FR
toLang IT = Just ISO639.IT
toLang PL = Just ISO639.PL
toLang PT = Just ISO639.PT
toLang RU = Just ISO639.RU
toLang UK = Just ISO639.UK
toLang ZH = Just ISO639.ZH
toLang All = Nothing
get :: Lang -> Text -> Maybe Int -> IO [HyperdataDocument]
get la q ml = do get la q ml = do
eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) (toLang la) eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) la
either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs
getC :: Lang -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ())) getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do getC la q ml = do
eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) (toLang la) eRes <- HAL.getMetadataWithC [q] (Just 0) (fromIntegral <$> ml) la
pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes
-- case eRes of -- case eRes of
-- Left err -> panic $ pack $ show err -- Left err -> panic $ pack $ show err
-- Right (len, docsC) -> pure (len, docsC .| mapMC (toDoc' la)) -- Right (len, docsC) -> pure (len, docsC .| mapMC (toDoc' la))
toDoc' :: Lang -> HAL.Corpus -> IO HyperdataDocument toDoc' :: Maybe ISO639.ISO639_1 -> HAL.Corpus -> IO HyperdataDocument
toDoc' la (HAL.Corpus { .. }) = do toDoc' la (HAL.Corpus { .. }) = do
-- printDebug "[toDoc corpus] h" h -- printDebug "[toDoc corpus] h" h
(utctime, (pub_year, pub_month, pub_day)) <- (utctime, (pub_year, pub_month, pub_day)) <-
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) Just _corpus_date) Date.dateSplit (maybe (Just $ pack $ show Defaults.year) Just _corpus_date)
let abstractDefault = intercalate " " _corpus_abstract let abstractDefault = intercalate " " _corpus_abstract
let abstract = case toLang la of let abstract = case la of
Nothing -> abstractDefault Nothing -> abstractDefault
Just l -> fromMaybe abstractDefault (intercalate " " <$> Map.lookup l _corpus_abstract_lang_map) Just l -> fromMaybe abstractDefault (intercalate " " <$> Map.lookup l _corpus_abstract_lang_map)
pure HyperdataDocument { _hd_bdd = Just "Hal" pure HyperdataDocument { _hd_bdd = Just "Hal"
......
...@@ -35,13 +35,13 @@ get la l q a = do ...@@ -35,13 +35,13 @@ get la l q a = do
let let
printErr (DecodeFailure e _) = panic e printErr (DecodeFailure e _) = panic e
printErr e = panic (cs $ show e) printErr e = panic (cs $ show e)
toIsidoreDocs :: Reply -> [IsidoreDoc] toIsidoreDocs :: Reply -> [IsidoreDoc]
toIsidoreDocs (ReplyOnly r) = [r] toIsidoreDocs (ReplyOnly r) = [r]
toIsidoreDocs (Replies rs) = rs toIsidoreDocs (Replies rs) = rs
iDocs <- either printErr _content <$> Isidore.get l q a iDocs <- either printErr _content <$> Isidore.get l q a
hDocs <- mapM (\d -> isidoreToDoc la d) (toIsidoreDocs iDocs) hDocs <- mapM (\d -> isidoreToDoc la d) (toIsidoreDocs iDocs)
pure hDocs pure hDocs
...@@ -58,7 +58,7 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do ...@@ -58,7 +58,7 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
author :: Author -> Text author :: Author -> Text
author (Author fn ln) = (_name fn) <> ", " <> (_name ln) author (Author fn ln) = (_name fn) <> ", " <> (_name ln)
author (Authors aus) = Text.intercalate ". " $ map author aus author (Authors aus) = Text.intercalate ". " $ map author aus
creator2text :: Creator -> Text creator2text :: Creator -> Text
creator2text (Creator au) = author au creator2text (Creator au) = author au
creator2text (Creators aus') = Text.intercalate ". " $ map author aus' creator2text (Creators aus') = Text.intercalate ". " $ map author aus'
...@@ -67,9 +67,9 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do ...@@ -67,9 +67,9 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
langText (LangText _l t1) = t1 langText (LangText _l t1) = t1
langText (OnlyText t2 ) = t2 langText (OnlyText t2 ) = t2
langText (ArrayText ts ) = Text.intercalate " " $ map langText ts langText (ArrayText ts ) = Text.intercalate " " $ map langText ts
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit l (maybe (Just $ Text.pack $ show Defaults.year) (Just) d) (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit (maybe (Just $ Text.pack $ show Defaults.year) (Just) d)
pure HyperdataDocument pure HyperdataDocument
{ _hd_bdd = Just "Isidore" { _hd_bdd = Just "Isidore"
, _hd_doi = Nothing , _hd_doi = Nothing
...@@ -91,5 +91,3 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do ...@@ -91,5 +91,3 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
, _hd_publication_second = Nothing , _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (Text.pack . show) l , _hd_language_iso2 = Just $ (Text.pack . show) l
} }
...@@ -82,7 +82,7 @@ toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument ...@@ -82,7 +82,7 @@ toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument
toDoc la (ISTEX.Document i t a ab d s) = do toDoc la (ISTEX.Document i t a ab d s) = do
--printDebug "ISTEX date" d --printDebug "ISTEX date" d
(utctime, (pub_year, pub_month, pub_day)) <- (utctime, (pub_year, pub_month, pub_day)) <-
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d) Date.dateSplit (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d)
--printDebug "toDoc Istex" (utctime, (pub_year, pub_month, pub_day)) --printDebug "toDoc Istex" (utctime, (pub_year, pub_month, pub_day))
pure $ HyperdataDocument { _hd_bdd = Just "Istex" pure $ HyperdataDocument { _hd_bdd = Just "Istex"
, _hd_doi = Just i , _hd_doi = Just i
......
...@@ -236,7 +236,7 @@ toDoc ff d = do ...@@ -236,7 +236,7 @@ toDoc ff d = do
let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
-- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit dateToParse
let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
, _hd_doi = lookup "doi" d , _hd_doi = lookup "doi" d
......
...@@ -48,10 +48,10 @@ import qualified Data.List as List ...@@ -48,10 +48,10 @@ import qualified Data.List as List
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Parse date to Ints -- | Parse date to Ints
-- TODO add hours, minutes and seconds -- TODO add hours, minutes and seconds
dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day)) dateSplit :: Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing)) dateSplit Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
dateSplit l (Just txt) = do dateSplit (Just txt) = do
utcTime <- parse l txt utcTime <- parse txt
let (y, m, d) = split' utcTime let (y, m, d) = split' utcTime
pure (Just utcTime, (Just y, Just m,Just d)) pure (Just utcTime, (Just y, Just m,Just d))
...@@ -72,15 +72,15 @@ type Day = Int ...@@ -72,15 +72,15 @@ type Day = Int
-- 1900-04-01 19:00:00 UTC -- 1900-04-01 19:00:00 UTC
-- >>> parse EN (pack "April 1 1900") -- >>> parse EN (pack "April 1 1900")
-- 1900-04-01 00:00:00 UTC -- 1900-04-01 00:00:00 UTC
parse :: Lang -> Text -> IO UTCTime parse :: Text -> IO UTCTime
parse lang s = do parse s = do
-- printDebug "Date: " s -- printDebug "Date: " s
let result = dateFlow (DucklingFailure s) let result = dateFlow (DucklingFailure s)
--printDebug "Date': " dateStr' --printDebug "Date': " dateStr'
case result of case result of
DateFlowSuccess ok -> pure ok DateFlowSuccess ok -> pure ok
DateFlowFailure -> (withDebugMode (DebugMode True) DateFlowFailure -> (withDebugMode (DebugMode True)
"[G.C.T.P.T.Date parse]" (lang,s) "[G.C.T.P.T.Date parse]" s
$ getCurrentTime) $ getCurrentTime)
_ -> panic "[G.C.T.C.Parsers.Date] parse: Should not happen" _ -> panic "[G.C.T.C.Parsers.Date] parse: Should not happen"
...@@ -206,4 +206,3 @@ parseDateWithDuckling lang input options = do ...@@ -206,4 +206,3 @@ parseDateWithDuckling lang input options = do
--pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
-- TODO check/test Options False or True -- TODO check/test Options False or True
analyze input contxt options $ HashSet.fromList [(Seal Time)] analyze input contxt options $ HashSet.fromList [(Seal Time)]
{-| {-|
Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata
<<<<<<< HEAD <<<<<<< HEAD
Description : To query Wikidata Description : To query Wikidata
======= =======
Description : To query Wikidata Description : To query Wikidata
>>>>>>> dev-clustering >>>>>>> dev-clustering
...@@ -68,7 +68,7 @@ wikiPageToDocument m wr = do ...@@ -68,7 +68,7 @@ wikiPageToDocument m wr = do
source = Nothing source = Nothing
abstract = Just $ concat $ take m sections abstract = Just $ concat $ take m sections
(date, (year, month, day)) <- dateSplit EN $ head (date, (year, month, day)) <- dateSplit $ head
$ catMaybes $ catMaybes
[ wr ^. wr_yearStart [ wr ^. wr_yearStart
, wr ^. wr_yearEnd , wr ^. wr_yearEnd
...@@ -130,4 +130,3 @@ wikidataQuery n = List.unlines ...@@ -130,4 +130,3 @@ wikidataQuery n = List.unlines
," }" ," }"
," LIMIT " <> (cs $ show n) ," LIMIT " <> (cs $ show n)
] ]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment