Commit b87c1360 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge remote-tracking branch 'origin/97-dev-istex-search' into dev

parents 90e9cdf2 fe959c1d
......@@ -64,6 +64,7 @@ library
Gargantext.Database.Admin.Config
Gargantext.Database.Admin.Types.Hyperdata
Gargantext.Database.Admin.Types.Node
Gargantext.Defaults
Gargantext.Core.Text
Gargantext.Core.Text.Context
Gargantext.Core.Text.Corpus.Parsers
......
......@@ -88,6 +88,7 @@ library:
- Gargantext.Database.Admin.Config
- Gargantext.Database.Admin.Types.Hyperdata
- Gargantext.Database.Admin.Types.Node
- Gargantext.Defaults
- Gargantext.Core.Text
- Gargantext.Core.Text.Context
- Gargantext.Core.Text.Corpus.Parsers
......
......@@ -100,10 +100,9 @@ documentUpload nId doc = do
Just c -> c
Nothing -> panic $ T.pack $ "[G.A.N.DU] Node has no corpus parent: " <> show nId
(theFullDate, (year, month, day)) <- liftBase
$ dateSplit EN
$ Just
$ view du_date doc <> "T:0:0:0"
(theFullDate, (year, month, day)) <- liftBase $ dateSplit EN
$ Just
$ view du_date doc <> "T:0:0:0"
let hd = HyperdataDocument { _hd_bdd = Nothing
, _hd_doi = Nothing
......
......@@ -37,6 +37,7 @@ import Gargantext.Database.Admin.Types.Hyperdata.Frame
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith)
import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Gargantext.Defaults as Defaults
import Gargantext.Prelude
import GHC.Generics (Generic)
import Servant
......@@ -114,9 +115,9 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont
date' = (\(Date { year, month, day }) -> T.concat [ T.pack $ show year, "-"
, T.pack $ show month, "-"
, T.pack $ show day ]) <$> date
year' = fromIntegral $ maybe 2021 (\(Date { year }) -> year) date
month' = fromIntegral $ maybe 10 (\(Date { month }) -> month) date
day' = fromIntegral $ maybe 4 (\(Date { day }) -> day) date in
year' = fromIntegral $ maybe Defaults.year (\(Date { year }) -> year) date
month' = maybe Defaults.month (\(Date { month }) -> fromIntegral month) date
day' = maybe Defaults.day (\(Date { day }) -> fromIntegral day) date in
Right HyperdataDocument { _hd_bdd = Just "FrameWrite"
, _hd_doi = Nothing
, _hd_url = Nothing
......
......@@ -31,6 +31,7 @@ import Gargantext.Database.Admin.Types.Hyperdata (HyperdataContact(..), Hyperdat
import Gargantext.Database.Admin.Types.Hyperdata.Contact (_cw_organization)
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Facet
import qualified Gargantext.Defaults as Defaults
import Gargantext.Prelude
import Gargantext.Utils.Aeson (defaultTaggedObject)
import Servant
......@@ -258,12 +259,12 @@ instance ToHyperdataRow HyperdataDocument where
, _hr_language_iso2 = fromMaybe "EN" _hd_language_iso2
, _hr_page = fromMaybe 0 _hd_page
, _hr_publication_date = fromMaybe "" _hd_publication_date
, _hr_publication_day = fromMaybe 1 _hd_publication_day
, _hr_publication_hour = fromMaybe 1 _hd_publication_hour
, _hr_publication_minute = fromMaybe 1 _hd_publication_minute
, _hr_publication_month = fromMaybe 1 _hd_publication_month
, _hr_publication_second = fromMaybe 1 _hd_publication_second
, _hr_publication_year = fromMaybe 2020 _hd_publication_year
, _hr_publication_year = fromMaybe (fromIntegral Defaults.year) _hd_publication_year
, _hr_publication_month = fromMaybe Defaults.month _hd_publication_month
, _hr_publication_day = fromMaybe Defaults.day _hd_publication_day
, _hr_publication_hour = fromMaybe 0 _hd_publication_hour
, _hr_publication_minute = fromMaybe 0 _hd_publication_minute
, _hr_publication_second = fromMaybe 0 _hd_publication_second
, _hr_source = fromMaybe "" _hd_source
, _hr_title = fromMaybe "Title" _hd_title
, _hr_url = fromMaybe "" _hd_url
......
......@@ -15,8 +15,7 @@ module Gargantext.Core.Text.Corpus.API
, Limit
, get
, externalAPIs
)
where
) where
import Conduit
import Data.Either (Either(..))
......
......@@ -20,6 +20,7 @@ import Servant.Client (ClientError)
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import qualified Gargantext.Defaults as Defaults
import Gargantext.Prelude
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified HAL as HAL
......@@ -41,24 +42,25 @@ getC la q ml = do
toDoc' :: Lang -> HAL.Corpus -> IO HyperdataDocument
toDoc' la (HAL.Corpus i t ab d s aus affs struct_id) = do
(utctime, (pub_year, pub_month, pub_day)) <- Date.dateSplit la (maybe (Just "2019") Just d)
pure $ HyperdataDocument { _hd_bdd = Just "Hal"
, _hd_doi = Just $ pack $ show i
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = Just $ intercalate " " t
, _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" aus
, _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" $ affs <> map (cs . show) struct_id
, _hd_source = Just $ maybe "Nothing" identity s
, _hd_abstract = Just $ intercalate " " ab
, _hd_publication_date = fmap (pack . show) utctime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (pack . show) la }
-- Split the HAL date into a full timestamp plus (year, month, day);
-- when HAL provides no date, the default year is parsed instead.
(utctime, (pub_year, pub_month, pub_day)) <-
  Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) Just d)
pure HyperdataDocument
  { _hd_bdd                = Just "Hal"
  , _hd_doi                = Just $ pack $ show i
  , _hd_url                = Nothing
  , _hd_uniqId             = Nothing
  , _hd_uniqIdBdd          = Nothing
  , _hd_page               = Nothing
  , _hd_title              = Just $ intercalate " " t
    -- 'intercalate' puts the separator only between items; a left fold
    -- seeded with "" would prefix the result with a stray ", ".
  , _hd_authors            = Just $ intercalate ", " aus
  , _hd_institutes         = Just $ intercalate ", " $ affs <> map (cs . show) struct_id
  , _hd_source             = Just $ maybe "Nothing" identity s
  , _hd_abstract           = Just $ intercalate " " ab
  , _hd_publication_date   = fmap (pack . show) utctime
  , _hd_publication_year   = pub_year
  , _hd_publication_month  = pub_month
  , _hd_publication_day    = pub_day
  , _hd_publication_hour   = Nothing
  , _hd_publication_minute = Nothing
  , _hd_publication_second = Nothing
  , _hd_language_iso2      = Just $ (pack . show) la
  }
......@@ -18,6 +18,7 @@ import Data.Text (Text)
import qualified Data.Text as Text
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import qualified Gargantext.Defaults as Defaults
import Gargantext.Prelude
import Isidore.Client
import Servant.Client
......@@ -67,26 +68,28 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
langText (OnlyText t2 ) = t2
langText (ArrayText ts ) = Text.intercalate " " $ map langText ts
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit l (maybe (Just "2019") (Just) d)
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit l (maybe (Just $ Text.pack $ show Defaults.year) (Just) d)
pure $ HyperdataDocument (Just "Isidore")
Nothing
u
Nothing
Nothing
Nothing
(Just $ cleanText $ langText t)
(creator2text <$> as)
Nothing
(Just $ maybe "Nothing" identity $ _sourceName <$> s)
(cleanText <$> langText <$> a)
(fmap (Text.pack . show) utcTime)
(pub_year)
(pub_month)
(pub_day)
Nothing
Nothing
Nothing
(Just $ (Text.pack . show) l)
pure HyperdataDocument
{ _hd_bdd = Just "Isidore"
, _hd_doi = Nothing
, _hd_url = u
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = Just $ cleanText $ langText t
, _hd_authors = creator2text <$> as
, _hd_institutes = Nothing
, _hd_source = Just $ maybe "Nothing" identity $ _sourceName <$> s
, _hd_abstract = cleanText <$> langText <$> a
, _hd_publication_date = fmap (Text.pack . show) utcTime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (Text.pack . show) l
}
......@@ -13,12 +13,14 @@ Portability : POSIX
module Gargantext.Core.Text.Corpus.API.Istex
where
import Data.Either (Either(..))
import Data.List (concat)
import Data.Maybe
import Data.Text (Text, pack)
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import qualified Gargantext.Defaults as Defaults
import Gargantext.Prelude
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified ISTEX as ISTEX
......@@ -26,19 +28,37 @@ import qualified ISTEX.Client as ISTEX
get :: Lang -> Text -> Maybe Integer -> IO [HyperdataDocument]
get la q ml = do
docs <- ISTEX.getMetadataWith q (fromIntegral <$> ml)
either (panic . pack . show) (toDoc' la) docs
-- Previously: docs <- ISTEX.getMetadataWith q (fromIntegral <$> ml)
-- Trace the call arguments: language, query and optional limit.
printDebug "[Istex.get] calling getMetadataScrollProgress for la" la
printDebug "[Istex.get] calling getMetadataScrollProgress for q" q
printDebug "[Istex.get] calling getMetadataScrollProgress for ml" ml
-- The scroll call takes a "d/h/m/s/ms" time interval; "1m" is passed
-- (presumably 1 minute rather than 1 month -- TODO confirm against the
-- ISTEX client). NOTE: the limit `ml` is only logged above, it is not
-- forwarded to the scroll call.
eDocs <- ISTEX.getMetadataScroll q "1m" Nothing 0
printDebug "[Istex.get] will print length" (0 :: Int)
-- A failed request aborts with the shown error; a successful one logs
-- the number of hits and converts them to documents.
case eDocs of
  Left err   -> panic . pack . show $ err
  Right docs -> do
    printDebug "[Istex.get] length docs" $ length (ISTEX._documents_hits docs)
    toDoc' la docs
-- A progress-reporting variant based on ISTEX.getMetadataScrollProgress
-- (with a progress callback and an error handler) was sketched here but
-- is not wired in.
toDoc' :: Lang -> ISTEX.Documents -> IO [HyperdataDocument]
toDoc' la docs' = do
toDoc' la docs' = mapM (toDoc la) (ISTEX._documents_hits docs')
--printDebug "ISTEX" (ISTEX._documents_total docs')
mapM (toDoc la) (ISTEX._documents_hits docs')
-- | TODO remove dateSplit here
-- TODO current year as default
toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument
toDoc la (ISTEX.Document i t a ab d s) = do
(utctime, (pub_year, pub_month, pub_day)) <- Date.dateSplit la (maybe (Just "2019") (Just . pack . show) d)
(utctime, (pub_year, pub_month, pub_day)) <-
Date.dateSplit la (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d)
pure $ HyperdataDocument { _hd_bdd = Just "Istex"
, _hd_doi = Just i
, _hd_url = Nothing
......
......@@ -164,6 +164,7 @@ parseFormatC _ _ _ = undefined
-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal Plain p = parseHal p
parseFile CsvGargV3 Plain p = parseCsv p
......@@ -185,27 +186,27 @@ toDoc ff d = do
let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
, _hd_doi = lookup "doi" d
, _hd_url = lookup "URL" d
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = lookup "title" d
, _hd_authors = Nothing
, _hd_institutes = lookup "authors" d
, _hd_source = lookup "source" d
, _hd_abstract = lookup "abstract" d
, _hd_publication_date = fmap (DT.pack . show) utcTime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (DT.pack . show) lang }
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
pure HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
, _hd_doi = lookup "doi" d
, _hd_url = lookup "URL" d
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = lookup "title" d
, _hd_authors = Nothing
, _hd_institutes = lookup "authors" d
, _hd_source = lookup "source" d
, _hd_abstract = lookup "abstract" d
, _hd_publication_date = fmap (DT.pack . show) utcTime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (DT.pack . show) lang }
enrichWith :: FileType
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
......
......@@ -25,8 +25,10 @@ import Data.Aeson (toJSON, Value)
import Data.Either (Either(..))
import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn, replace)
import Data.Time (defaultTimeLocale, toGregorian, iso8601DateFormat, parseTimeM)
import Data.Time (defaultTimeLocale, iso8601DateFormat, parseTimeM, toGregorian)
import qualified Data.Time.Calendar as DTC
import Data.Time.Clock (UTCTime(..), getCurrentTime)
import Data.Time.Clock ( secondsToDiffTime)
import Data.Time.LocalTime (utc)
import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
import Duckling.Api (analyze)
......@@ -37,7 +39,7 @@ import Duckling.Types (ResolvedToken(..), ResolvedVal(..))
import Gargantext.Core (Lang(FR,EN))
import Gargantext.Core.Types (DebugMode(..), withDebugMode)
import Gargantext.Prelude
import qualified Control.Exception as CE
--import qualified Control.Exception as CE
import qualified Data.Aeson as Json
import qualified Data.HashSet as HashSet
import qualified Duckling.Core as DC
......@@ -136,28 +138,28 @@ parserLang lang = panic $ "[G.C.T.C.P.Date] Lang not implemented" <> (cs $ show
parseRawSafe :: Lang -> Text -> IO DateFlow
parseRawSafe lang text = do
triedParseRaw <- tryParseRaw lang text
let triedParseRaw = parseRaw lang text
dateStr' <- case triedParseRaw of
Left (CE.SomeException err) -> do
--Left (CE.SomeException err) -> do
Left err -> do
envLang <- getEnv "LANG"
printDebug "[G.C.T.C.P.Date] Exception: " (err, envLang, lang, text)
pure $ DucklingFailure text
Right res -> pure $ DucklingSuccess res
pure dateStr'
tryParseRaw :: CE.Exception e => Lang -> Text -> IO (Either e Text)
tryParseRaw lang text = CE.try (parseRaw lang text)
--tryParseRaw :: CE.Exception e => Lang -> Text -> IO (Either e Text)
--tryParseRaw lang text = CE.try (parseRaw lang text)
parseRaw :: Lang -> Text -> IO Text
parseRaw :: Lang -> Text -> Either Text Text
parseRaw lang text = do -- case result
maybeResult <- extractValue <$> getTimeValue
<$> parseDateWithDuckling lang text (Options True)
let maybeResult = extractValue $ getTimeValue
$ parseDateWithDuckling lang text (Options True)
case maybeResult of
Just result -> pure result
Just result -> Right result
Nothing -> do
printDebug ("[G.C.T.C.P.D.parseRaw] ERROR " <> (cs . show) lang)
text
pure ""
-- printDebug ("[G.C.T.C.P.D.parseRaw] ERROR " <> (cs . show) lang) text
Left $ "[G.C.T.C.P.D.parseRaw ERROR] " <> (cs . show) lang <> " :: " <> text
getTimeValue :: [ResolvedToken] -> Maybe Value
getTimeValue rt = case head rt of
......@@ -182,13 +184,21 @@ utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time
-- | Local Context which depends on Lang and Time
localContext :: Lang -> DucklingTime -> Context
localContext lang dt = Context {referenceTime = dt, locale = makeLocale (parserLang lang) Nothing}
localContext lang dt = Context { referenceTime = dt
, locale = makeLocale (parserLang lang) Nothing }
-- | Fixed calendar day (January 1st of year 1) on which 'defaultUTCTime'
-- is built.
defaultDay :: DTC.Day
defaultDay = DTC.fromGregorian y m d
  where
    y = 1
    m = 1
    d = 1
-- | Midnight of 'defaultDay'. 'parseDateWithDuckling' uses this constant
-- as the Duckling reference time, so parsing does not read the clock.
defaultUTCTime :: UTCTime
defaultUTCTime = UTCTime defaultDay midnight
  where
    midnight = secondsToDiffTime 0
-- | Date parser with Duckling
parseDateWithDuckling :: Lang -> Text -> Options -> IO [ResolvedToken]
parseDateWithDuckling :: Lang -> Text -> Options -> [ResolvedToken]
parseDateWithDuckling lang input options = do
contxt <- localContext lang <$> utcToDucklingTime <$> getCurrentTime
--pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
-- TODO check/test Options False or True
pure $ analyze input contxt options $ HashSet.fromList [(Seal Time)]
let contxt = localContext lang $ utcToDucklingTime defaultUTCTime
--pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
-- TODO check/test Options False or True
analyze input contxt options $ HashSet.fromList [(Seal Time)]
......@@ -23,6 +23,7 @@ import Data.Aeson.TH (deriveJSON)
import Data.ByteString.Lazy (readFile)
import Data.Text (Text, unpack)
import Gargantext.Core.Utils.Prefix (unPrefix)
import qualified Gargantext.Defaults as Defaults
import Gargantext.Prelude
import System.IO (FilePath)
import Gargantext.Core.Text.Corpus.Parsers.CSV (CsvDoc(..), writeFile, headerCsvGargV3)
......@@ -52,8 +53,8 @@ patent2csvDoc (Patent { .. }) =
CsvDoc { csv_title = _patent_title
, csv_source = "Source"
, csv_publication_year = Just $ read (unpack _patent_year)
, csv_publication_month = Just 1
, csv_publication_day = Just 1
, csv_publication_month = Just $ Defaults.month
, csv_publication_day = Just $ Defaults.day
, csv_abstract = _patent_abstract
, csv_authors = "Authors" }
......
......@@ -68,8 +68,7 @@ wikiPageToDocument m wr = do
source = Nothing
abstract = Just $ concat $ take m sections
(date, (year, month, day))
<- dateSplit EN $ head
(date, (year, month, day)) <- dateSplit EN $ head
$ catMaybes
[ wr ^. wr_yearStart
, wr ^. wr_yearEnd
......
......@@ -74,6 +74,7 @@ import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Prelude (Cmd, runPGSQuery{-, formatPGSQuery-})
import Gargantext.Database.Schema.Node (NodePoly(..))
import qualified Gargantext.Defaults as Defaults
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (hash)
import qualified Data.Text as DT (pack, concat, take)
......@@ -282,9 +283,9 @@ instance ToNode HyperdataDocument where
-- NOTE: There is no year '0' in postgres, there is year 1 AD and before that year 1 BC:
-- select '0001-01-01'::date, '0001-01-01'::date - '1 day'::interval;
-- 0001-01-01 0001-12-31 00:00:00 BC
y = maybe 1 fromIntegral $ _hd_publication_year h
m = fromMaybe 1 $ _hd_publication_month h
d = fromMaybe 1 $ _hd_publication_day h
-- Each missing date component falls back on its own default: year on
-- Defaults.year, month on Defaults.month, day on Defaults.day.
-- Defaults.year is an Integer, hence the conversion to the field's type.
y = fromIntegral $ fromMaybe (fromIntegral Defaults.year) $ _hd_publication_year h
m = fromMaybe Defaults.month $ _hd_publication_month h
d = fromMaybe Defaults.day $ _hd_publication_day h
-- TODO better Node
instance ToNode HyperdataContact where
......
{-|
Module : Gargantext.Defaults
Description : Gargantext default values
Copyright : (c) CNRS, 2021-present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Defaults
  ( year
  , month
  , day
  ) where

import Gargantext.Prelude

-- | Fallback publication year for documents that carry none.
-- Presumably 1 rather than 0 because year 0 is not a valid date for the
-- database -- TODO confirm.
year :: Integer
year = 1

-- | Fallback publication month for documents that carry none.
month :: Int
month = 1

-- | Fallback publication day for documents that carry none.
day :: Int
day = 1
......@@ -73,7 +73,7 @@ extra-deps:
- git: https://gitlab.iscpif.fr/gargantext/crawlers/pubmed.git
commit: 02e03d9b856bd35d391f43da8525330f9d184615
- git: https://gitlab.iscpif.fr/gargantext/crawlers/istex.git
commit: daeae80365250c4bd539f0a65e271f9aa37f731f
commit: a4a6fb6a578255c9e5b52aab2afccf874976a3f5
- git: https://gitlab.iscpif.fr/gargantext/crawlers/hal.git
commit: 9a43470241690a19c1c381c42a62c5dd4e28dff2
- git: https://gitlab.iscpif.fr/gargantext/crawlers/isidore.git
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment