Implement search with given language (for abstracts etc)

parent 0d82e5a6
...@@ -32,6 +32,7 @@ library ...@@ -32,6 +32,7 @@ library
HAL.Doc.Corpus HAL.Doc.Corpus
HAL.Doc.EntityTree HAL.Doc.EntityTree
HAL.Doc.Struct HAL.Doc.Struct
HAL.Utils
Tree Tree
other-modules: other-modules:
Paths_crawlerHAL Paths_crawlerHAL
......
...@@ -7,6 +7,7 @@ import Data.Text ...@@ -7,6 +7,7 @@ import Data.Text
import HAL.Client import HAL.Client
import HAL.Doc.Corpus import HAL.Doc.Corpus
import HAL.Doc.Struct import HAL.Doc.Struct
import HAL.Utils (langAbstractS)
import Network.HTTP.Client (newManager) import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings) import Network.HTTP.Client.TLS (tlsManagerSettings)
import Protolude import Protolude
...@@ -72,7 +73,7 @@ countResults q = do ...@@ -72,7 +73,7 @@ countResults q = do
requestedFields :: Maybe ISO639_1 -> Text requestedFields :: Maybe ISO639_1 -> Text
requestedFields (Just EN) = "docid,title_s,en_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s" requestedFields (Just EN) = "docid,title_s,en_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s"
requestedFields (Just FR) = "docid,title_s,en_abstract_s,fr_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s" requestedFields (Just lang) = "docid,title_s,en_abstract_s," <> langAbstractS lang <> ",submittedDate_s,source_s,authFullName_s,authOrganism_s"
requestedFields _ = requestedFields (Just EN) requestedFields _ = requestedFields (Just EN)
structFields :: Text structFields :: Text
......
...@@ -4,37 +4,49 @@ module HAL.Doc.Corpus where ...@@ -4,37 +4,49 @@ module HAL.Doc.Corpus where
import Control.Lens qualified as L import Control.Lens qualified as L
import Data.Aeson import Data.Aeson
import Data.Aeson.Key (fromText)
import Data.Default import Data.Default
import Data.Map.Strict qualified as Map
import GHC.Generics import GHC.Generics
import HAL.Utils (allLangs, langAbstractS)
import Data.LanguageCodes (ISO639_1(..))
import Protolude import Protolude
import Servant.API (ToHttpApiData(..)) import Servant.API (ToHttpApiData(..))
data Corpus = Corpus data Corpus = Corpus
{ _corpus_docid :: Text { _corpus_docid :: Text
, _corpus_title :: [Text] , _corpus_title :: [Text]
, _corpus_abstract :: [Text] , _corpus_abstract :: [Text]
, _corpus_date :: Maybe Text , _corpus_abstract_lang_map :: Map ISO639_1 [Text]
, _corpus_source :: Maybe Text , _corpus_date :: Maybe Text
, _corpus_authors_names :: [Text] , _corpus_source :: Maybe Text
, _corpus_authors_names :: [Text]
, _corpus_authors_affiliations :: [Text] , _corpus_authors_affiliations :: [Text]
, _corpus_struct_id :: [Int] , _corpus_struct_id :: [Int]
} deriving (Show, Generic) } deriving (Show, Generic)
L.makeLenses ''Corpus L.makeLenses ''Corpus
instance Default Corpus where instance Default Corpus where
def = Corpus "default Id" def def def def def def def def = Corpus "default Id" def def def def def def def def
instance FromJSON Corpus where instance FromJSON Corpus where
parseJSON = withObject "Corpus" $ parseJSON = withObject "Corpus" $ \o -> do
\o -> Corpus _corpus_docid <- (o .: "docid")
<$> (o .: "docid") _corpus_title <- (o .: "title_s" <|> return [])
<*> (o .: "title_s" <|> return []) _corpus_abstract <- (o .: "en_abstract_s" <|> return [])
<*> (o .: "en_abstract_s" <|> return []) _corpus_date <- (o .:? "submittedDate_s")
<*> (o .:? "submittedDate_s") _corpus_source <- (o .:? "source_s")
<*> (o .:? "source_s") _corpus_authors_names <- (o .: "authFullName_s" <|> return [])
<*> (o .: "authFullName_s" <|> return []) _corpus_authors_affiliations <- (o .: "authOrganism_s" <|> return [])
<*> (o .: "authOrganism_s" <|> return []) _corpus_struct_id <- (o .: "structId_i" <|> return [])
<*> (o .: "structId_i" <|> return [])
abstracts <-
mapM (\lang -> do
ma <- o .:? (fromText $ langAbstractS lang)
pure $ (\a -> (lang, a)) <$> ma) allLangs
let _corpus_abstract_lang_map = Map.fromList $ catMaybes abstracts
pure $ Corpus { .. }
instance ToHttpApiData Corpus where instance ToHttpApiData Corpus where
toUrlPiece _ = "docid,title_s,en_abstract_s,fr_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s,structId_i" toUrlPiece _ = "docid,title_s,en_abstract_s,fr_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s,structId_i"
module HAL.Utils where
import Data.LanguageCodes (ISO639_1(..), language)
import Data.Text qualified as T
import Protolude
allLangs :: [ISO639_1]
allLangs = enumFrom (toEnum 0) :: [ISO639_1]
langAbstractS :: ISO639_1 -> Text
langAbstractS lang = (T.pack $ language lang) <> "_abstract_s"
...@@ -2,9 +2,9 @@ module Tree where ...@@ -2,9 +2,9 @@ module Tree where
import Control.Lens.Getter ((^.)) import Control.Lens.Getter ((^.))
import Data.List.Split (chunksOf) import Data.List.Split (chunksOf)
import Data.Map qualified as Map import Data.Map.Strict qualified as Map
import Data.Map (insert) import Data.Map.Strict (insert)
import Data.Map.Internal (merge, preserveMissing, zipWithMatched) import Data.Map.Strict.Internal (merge, preserveMissing, zipWithMatched)
import Data.Text qualified as T import Data.Text qualified as T
import Data.Text.Format (format) import Data.Text.Format (format)
import Data.Text.Lazy qualified as TL import Data.Text.Lazy qualified as TL
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment