Commit 46d254d9 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge branch 'dev-merge' into dev

parents 0322ffc8 bdd5deec
......@@ -2,7 +2,7 @@
# https://vadosware.io/post/zero-to-continuous-integrated-testing-a-haskell-project-with-gitlab/
#
#
image: cgenie/stack-build:lts-17.13-garg
image: cgenie/stack-build:lts-18.18-garg
variables:
STACK_ROOT: "${CI_PROJECT_DIR}/.stack-root"
......
......@@ -246,6 +246,7 @@ library:
- timezone-series
- transformers
- transformers-base
- tuple
- unordered-containers
- utf8-string
- uuid
......
......@@ -200,7 +200,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
Just Web -> do
printDebug "[addToCorpusWithQuery] processing web request" datafield
_ <- triggerSearxSearch cid q l
_ <- triggerSearxSearch user cid q l logStatus
pure JobLog { _scst_succeeded = Just 3
, _scst_failed = Just 0
......
......@@ -7,32 +7,60 @@ module Gargantext.API.Node.Corpus.Searx where
import Control.Lens (view)
import qualified Data.Aeson as Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Either (Either(..))
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as T
import Data.Time.Calendar (Day, toGregorian)
import Data.Time.Format (defaultTimeLocale, formatTime, parseTimeM)
import Data.Tuple.Select (sel1, sel2, sel3)
import GHC.Generics (Generic)
import Network.HTTP.Client
import Network.HTTP.Client.TLS
import qualified Prelude as Prelude
import Protolude (encodeUtf8, Text, Either)
import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
import Gargantext.Prelude
import Gargantext.Prelude.Config
import Gargantext.Core (Lang(..))
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
--import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (jobLogSuccess)
import Gargantext.Core (Lang(..), PosTagAlgo(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.Terms (TermType(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Database.Action.Flow (insertMasterDocs) --, DataText(..))
import Gargantext.Database.Action.Flow.List (flowList_DbRepo)
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Admin.Types.Node (CorpusId)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Config (userMaster)
import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Node (CorpusId, ListId)
import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (defaultListMaybe, getOrMkList)
import Gargantext.Database.Query.Tree.Root (getOrMk_RootWithCorpus)
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
langToSearx :: Lang -> Text
langToSearx EN = "en-US"
langToSearx FR = "fr-FR"
langToSearx All = "en-US"
data SearxResult = SearxResult
{ _sr_url :: Text
, _sr_title :: Text
, _sr_content :: Maybe Text
, _sr_engine :: Text
, _sr_score :: Double
, _sr_category :: Text
, _sr_pretty_url :: Text }
{ _sr_url :: Text
, _sr_title :: Text
, _sr_content :: Maybe Text
, _sr_engine :: Text
, _sr_score :: Double
, _sr_category :: Text
, _sr_pretty_url :: Text
, _sr_publishedDate :: Text -- "Nov 19, 2021"
, _sr_pubdate :: Text -- "2021-11-19 02:12:00+0000"
}
deriving (Show, Eq, Generic)
-- , _sr_parsed_url
-- , _sr_engines
......@@ -76,34 +104,117 @@ fetchSearxPage (FetchSearxParams { _fsp_language
, ("categories", "news") -- https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/70#note_3976
, ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
--, ("time_range", "None")
, ("language", encodeUtf8 $ T.pack $ show _fsp_language)
, ("language", encodeUtf8 $ langToSearx _fsp_language)
, ("format", "json")
] req
res <- httpLbs request _fsp_manager
let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
pure dec
insertSearxResponse :: (MonadBase IO m, FlowCmdM env err m)
=> User
-> CorpusId
-> ListId
-> Lang
-> Either Prelude.String SearxResponse
-> m ()
insertSearxResponse _ _ _ _ (Left _) = pure ()
insertSearxResponse user cId listId l (Right (SearxResponse { _srs_results })) = do
let docs = hyperdataDocumentFromSearxResult l <$> _srs_results
--printDebug "[triggerSearxSearch] docs" docs
-- docs :: [Either Text HyperdataDocument]
let docs' = catMaybes $ rightToMaybe <$> docs
Prelude.mapM_ (\(HyperdataDocument { _hd_title, _hd_publication_year, _hd_publication_date }) -> do
printDebug "[triggerSearxSearch] doc time" $
"[title] " <> (show _hd_title) <>
" :: [publication_year] " <> (show _hd_publication_year) <>
" :: [publication_date] " <> (show _hd_publication_date)
) docs'
--_ <- flowDataText user (DataNew [docs']) (Multi l) cId Nothing logStatus
let mCorpus = Nothing :: Maybe HyperdataCorpus
ids <- insertMasterDocs mCorpus (Multi l) docs'
_ <- Doc.add cId ids
(_masterUserId, _masterRootId, masterCorpusId)
<- getOrMk_RootWithCorpus (UserName userMaster) (Left "") mCorpus
let gp = GroupWithPosTag l CoreNLP HashMap.empty
ngs <- buildNgramsLists user cId masterCorpusId Nothing gp
_userListId <- flowList_DbRepo listId ngs
pure ()
-- TODO Make an async task out of this?
triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
=> CorpusId
=> User
-> CorpusId
-> API.Query
-> Lang
-> m ()
triggerSearxSearch cid q l = do
printDebug "[triggerSearxSearch] cid" cid
-> (JobLog -> m ())
-> m JobLog
triggerSearxSearch user cId q l logStatus = do
let numPages = 100
let jobLog = JobLog { _scst_succeeded = Just 0
, _scst_failed = Just 0
, _scst_remaining = Just numPages
, _scst_events = Just []
}
logStatus jobLog
printDebug "[triggerSearxSearch] cId" cId
printDebug "[triggerSearxSearch] q" q
printDebug "[triggerSearxSearch] l" l
cfg <- view hasConfig
uId <- getUserId user
let surl = _gc_frame_searx_url cfg
printDebug "[triggerSearxSearch] surl" surl
mListId <- defaultListMaybe cId
listId <- case mListId of
Nothing -> do
listId <- getOrMkList cId uId
pure listId
Just listId -> pure listId
printDebug "[triggerSearxSearch] listId" listId
manager <- liftBase $ newManager tlsManagerSettings
res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
, _fsp_manager = manager
, _fsp_pageno = 1
, _fsp_query = q
, _fsp_url = surl }
printDebug "[triggerSearxSearch] res" res
_ <- mapM (\page -> do
res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
, _fsp_manager = manager
, _fsp_pageno = page
, _fsp_query = q
, _fsp_url = surl }
insertSearxResponse user cId listId l res
logStatus $ JobLog { _scst_succeeded = Just page
, _scst_failed = Just 0
, _scst_remaining = Just (numPages - page)
, _scst_events = Just [] }
) [1..numPages]
--printDebug "[triggerSearxSearch] res" res
pure $ jobLogSuccess jobLog
hyperdataDocumentFromSearxResult :: Lang -> SearxResult -> Either T.Text HyperdataDocument
hyperdataDocumentFromSearxResult l (SearxResult { _sr_content, _sr_engine, _sr_pubdate, _sr_title }) = do
let mDate = parseTimeM False defaultTimeLocale "%Y-%m-%d %H:%M:%S+0000" (T.unpack _sr_pubdate) :: Maybe Day
let mGregorian = toGregorian <$> mDate
Right HyperdataDocument { _hd_bdd = Just "Searx"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = Just _sr_title
, _hd_authors = Nothing
, _hd_institutes = Nothing
, _hd_source = Just _sr_engine
, _hd_abstract = _sr_content
, _hd_publication_date = T.pack <$> formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%S" <$> mDate
, _hd_publication_year = fromIntegral <$> sel1 <$> mGregorian
, _hd_publication_month = sel2 <$> mGregorian
, _hd_publication_day = sel3 <$> mGregorian
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ T.pack $ show l }
pure ()
......@@ -71,9 +71,9 @@ import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith)
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
......
......@@ -279,7 +279,10 @@ instance ToNode HyperdataDocument where
where
n = maybe "No Title" (DT.take 255) (_hd_title h)
date = jour y m d
y = maybe 0 fromIntegral $ _hd_publication_year h
-- NOTE: There is no year '0' in postgres, there is year 1 AD and beofre that year 1 BC:
-- select '0001-01-01'::date, '0001-01-01'::date - '1 day'::interval;
-- 0001-01-01 0001-12-31 00:00:00 BC
y = maybe 1 fromIntegral $ _hd_publication_year h
m = fromMaybe 1 $ _hd_publication_month h
d = fromMaybe 1 $ _hd_publication_day h
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment