Commit 22e2da48 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[searx] first draft of searx parsing, updated stack to lts 18.4

parent f8ba5ba1
Pipeline #1681 passed with stage
in 54 minutes and 35 seconds
This diff is collapsed.
......@@ -188,6 +188,7 @@ library:
- random
- rdf4h
- regex-compat
- regex-tdfa
- resource-pool
- resourcet
- safe
......
......@@ -166,7 +166,7 @@ newEnv port file = do
when (port /= settings' ^. appPort) $
panic "TODO: conflicting settings of port"
config' <- readConfig file
config' <- readConfig file
self_url <- parseBaseUrl $ "http://0.0.0.0:" <> show port
dbParam <- databaseParameters file
pool <- newPool dbParam
......
......@@ -36,25 +36,26 @@ import Test.QuickCheck.Arbitrary
import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import qualified Gargantext.API.Admin.Orchestrator.Types as T
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Node.Corpus.New.File
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Database.Action.Mail (sendMail)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
import Gargantext.Database.Action.Mail (sendMail)
import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Gargantext.Database.GargDB as GargDB
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
------------------------------------------------------------------------
{-
......@@ -125,28 +126,11 @@ info :: FlowCmdM env err m => UserId -> m ApiInfo
info _u = pure $ ApiInfo API.externalAPIs
------------------------------------------------------------------------
data Database = Empty
| PubMed
| HAL
| IsTex
| Isidore
deriving (Eq, Show, Generic)
deriveJSON (unPrefix "") ''Database
instance ToSchema Database
database2origin :: Database -> DataOrigin
database2origin Empty = InternalOrigin T.IsTex
database2origin PubMed = ExternalOrigin T.PubMed
database2origin HAL = ExternalOrigin T.HAL
database2origin IsTex = ExternalOrigin T.IsTex
database2origin Isidore = ExternalOrigin T.Isidore
------------------------------------------------------------------------
data WithQuery = WithQuery
{ _wq_query :: !Text
, _wq_databases :: !Database
, _wq_datafield :: !Datafield
, _wq_lang :: !Lang
, _wq_node_id :: !Int
}
......@@ -190,36 +174,51 @@ addToCorpusWithQuery :: FlowCmdM env err m
-> Maybe Integer
-> (JobLog -> m ())
-> m JobLog
addToCorpusWithQuery user cid (WithQuery q dbs l _nid) maybeLimit logStatus = do
addToCorpusWithQuery user cid (WithQuery q dbs datafield l _nid) maybeLimit logStatus = do
-- TODO ...
logStatus JobLog { _scst_succeeded = Just 0
, _scst_failed = Just 0
, _scst_remaining = Just 5
, _scst_events = Just []
}
printDebug "addToCorpusWithQuery" (cid, dbs)
-- TODO add cid
-- TODO if cid is folder -> create Corpus
-- if cid is corpus -> add to corpus
-- if cid is root -> create corpus in Private
txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
logStatus JobLog { _scst_succeeded = Just 2
, _scst_failed = Just 0
, _scst_remaining = Just 1
, _scst_events = Just []
}
cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
printDebug "corpus id" cids
printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user
-- TODO ...
pure JobLog { _scst_succeeded = Just 3
, _scst_failed = Just 0
, _scst_remaining = Just 0
, _scst_remaining = Just 3
, _scst_events = Just []
}
printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
printDebug "[addToCorpusWithQuery] datafield" datafield
case datafield of
Web -> do
printDebug "[addToCorpusWithQuery] processing web request" datafield
_ <- triggerSearxSearch cid q l
pure JobLog { _scst_succeeded = Just 3
, _scst_failed = Just 0
, _scst_remaining = Just 0
, _scst_events = Just []
}
_ -> do
-- TODO add cid
-- TODO if cid is folder -> create Corpus
-- if cid is corpus -> add to corpus
-- if cid is root -> create corpus in Private
txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
logStatus JobLog { _scst_succeeded = Just 2
, _scst_failed = Just 0
, _scst_remaining = Just 1
, _scst_events = Just []
}
cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
printDebug "corpus id" cids
printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user
-- TODO ...
pure JobLog { _scst_succeeded = Just 3
, _scst_failed = Just 0
, _scst_remaining = Just 0
, _scst_events = Just []
}
type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
......
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.API.Node.Corpus.Searx where
import Control.Lens (view)
import qualified Data.Aeson as Aeson
import Data.Aeson.TH (deriveJSON)
import qualified Data.Text as T
import GHC.Generics (Generic)
import Network.HTTP.Client
import Network.HTTP.Client.TLS
import Protolude (encodeUtf8, Text)
import Gargantext.Prelude
import Gargantext.Prelude.Config
import Gargantext.Core (Lang(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Database.Action.Flow (FlowCmdM)
import Gargantext.Database.Admin.Types.Node (CorpusId)
import Gargantext.Database.Prelude (hasConfig)
data SearxResult = SearxResult
{ _sr_url :: Text
, _sr_title :: Text
, _sr_content :: Text
, _sr_engine :: Text
, _sr_score :: Double
, _sr_category :: Text
, _sr_pretty_url :: Text }
deriving (Show, Eq, Generic)
-- , _sr_parsed_url
-- , _sr_engines
-- , _sr_positions
$(deriveJSON (unPrefix "_sr_") ''SearxResult)
data SearxResponse = SearxResponse
{ _srs_query :: Text
, _srs_number_of_results :: Int
, _srs_results :: [SearxResult] }
deriving (Show, Eq, Generic)
-- , _srs_answers
-- , _srs_corrections
-- , _srs_infoboxes
-- , _srs_suggestions :: [Text]
-- , _srs_unresponsive_engines :: [Text] }
$(deriveJSON (unPrefix "_srs_") ''SearxResponse)
triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
=> CorpusId
-> API.Query
-> Lang
-> m ()
triggerSearxSearch cid q l = do
printDebug "[triggerSearxSearch] cid" cid
printDebug "[triggerSearxSearch] q" q
printDebug "[triggerSearxSearch] l" l
cfg <- view hasConfig
let surl = _gc_frame_searx_url cfg
printDebug "[triggerSearxSearch] surl" surl
res <- liftBase $ do
manager <- newManager tlsManagerSettings
req <- parseRequest $ T.unpack surl
let request = urlEncodedBody [ ("category_general", "1")
, ("q", encodeUtf8 q)
, ("pageno", "1")
, ("time_range", "None")
, ("language", "en-US") -- TODO
, ("format", "json")] req
httpLbs request manager
let dec = Aeson.decode $ responseBody res :: (Maybe SearxResponse)
printDebug "[triggerSearxSearch] dec" dec
pure ()
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.API.Node.Corpus.Types where
import Control.Lens hiding (elements, Empty)
import Control.Monad.Fail (fail)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Monoid (mempty)
import Data.Swagger
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics (Generic)
import Text.Regex.TDFA ((=~))
import Protolude ((++))
import Gargantext.Prelude
import qualified Gargantext.API.Admin.Orchestrator.Types as T
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Database.Action.Flow (DataOrigin(..))
data Database = Empty
| PubMed
| HAL
| IsTex
| Isidore
deriving (Eq, Show, Generic)
deriveJSON (unPrefix "") ''Database
instance ToSchema Database
database2origin :: Database -> DataOrigin
database2origin Empty = InternalOrigin T.IsTex
database2origin PubMed = ExternalOrigin T.PubMed
database2origin HAL = ExternalOrigin T.HAL
database2origin IsTex = ExternalOrigin T.IsTex
database2origin Isidore = ExternalOrigin T.Isidore
------------------------------------------------------------------------
data Datafield = Gargantext
| External (Maybe Database)
| Web
| Files
deriving (Eq, Show, Generic)
instance FromJSON Datafield where
parseJSON = withText "Datafield" $ \text ->
case text of
"Gargantext" -> pure Gargantext
"Web" -> pure Web
"Files" -> pure Files
v ->
let (preExternal, _, postExternal) = v =~ ("External " :: Text) :: (Text, Text, Text)
in
if preExternal == "" then do
db <- parseJSON $ String postExternal
pure $ External db
else fail $ "Cannot match patterh 'External <db>' for string " ++ (T.unpack v)
instance ToJSON Datafield where
toJSON (External db) = toJSON $ "External " ++ (show db)
toJSON s = toJSON $ show s
instance ToSchema Datafield where
declareNamedSchema _ = do
return $ NamedSchema (Just "Datafield") $ mempty
& type_ ?~ SwaggerObject
......@@ -69,7 +69,7 @@ data TermType lang
, _tt_ngramsSize :: !Int
, _tt_model :: !(Maybe (Tries Token ()))
}
deriving Generic
deriving (Generic)
makeLenses ''TermType
--group :: [Text] -> [Text]
......
......@@ -23,9 +23,6 @@ partitionWithKey :: (Ord a, Hashable k) => (k -> a -> Bool) -> HashMap k a -> (H
partitionWithKey p m = (HashMap.filterWithKey p m, HashMap.filterWithKey (\k -> not . p k) m)
mapKeys :: (Ord k2, Hashable k2) => (k1->k2) -> HashMap k1 a -> HashMap k2 a
mapKeys f = HashMap.fromList . HashMap.foldrWithKey (\k x xs -> (f k, x) : xs) []
------------------------------------------------------------------------
-- getKeyWithMaxValue :: Hashable k => HashMap k a -> Maybe k
getKeysOrderedByValueMaxFirst :: (Ord k, Hashable k, Ord a) => HashMap k a -> [k]
......
resolver:
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/17/10.yaml
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/18/4.yaml
flags: {}
extra-package-dbs: []
packages:
......@@ -94,6 +94,7 @@ extra-deps:
- json-stream-0.4.2.4@sha256:8b7f17d54a6e1e6311756270f8bcf51e91bab4300945400de66118470dcf51b9,4716
- located-base-0.1.1.1@sha256:7c6395f2b6fbf2d5f76c3514f774423838c0ea94e1c6a5530dd3c94b30c9d1c8,1904
- logging-effect-1.3.12@sha256:72d168dd09887649ba9501627219b6027cbec2d5541931555b7885b133785ce3,1679
- MissingH-1.4.3.0@sha256:32f9892ec98cd21df4f4d3ed8d95a3831ae74287ea0641d6f09b2dc6ef061d39,4859
- monoid-extras-0.5.1@sha256:438dbfd7b4dce47d8f0ca577f56caf94bd1e21391afa545cad09fe7cf2e5793d,2333
- rake-0.0.1@sha256:3380f6567fb17505d1095b7f32222c0b631fa04126ad39726c84262da99c08b3,2025
- servant-cassava-0.10.1@sha256:07e7b6ca67cf57dcb4a0041a399a25d058844505837c6479e01d62be59d01fdf,1665
......@@ -103,3 +104,4 @@ extra-deps:
- stemmer-0.5.2@sha256:823aec56249ec2619f60a2c0d1384b732894dbbbe642856d337ebfe9629a0efd,4082
- xmlbf-0.6.1@sha256:57867fcb39e0514d17b3328ff5de8d241a18482fc89bb742d9ed820a6a2a5187,1540
- xmlbf-xeno-0.2@sha256:39f70fced6052524c290cf595f114661c721452e65fc3e0953a44e7682a6a6b0,950
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment