Implement search with given language (for abstracts etc)

parent 0d82e5a6
......@@ -32,6 +32,7 @@ library
HAL.Doc.Corpus
HAL.Doc.EntityTree
HAL.Doc.Struct
HAL.Utils
Tree
other-modules:
Paths_crawlerHAL
......
......@@ -7,6 +7,7 @@ import Data.Text
import HAL.Client
import HAL.Doc.Corpus
import HAL.Doc.Struct
import HAL.Utils (langAbstractS)
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Protolude
......@@ -72,7 +73,7 @@ countResults q = do
-- | Comma-separated list of HAL fields to request for a search in the
-- given language.
--
-- The English abstract field is always part of the list. For 'EN' no other
-- abstract field is added, 'FR' adds @fr_abstract_s@, and any other
-- language adds the field named by 'langAbstractS'. When no language is
-- given, the 'EN' list is used.
requestedFields :: Maybe ISO639_1 -> Text
requestedFields mLang = case mLang of
  Just EN -> "docid,title_s,en_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s"
  Just FR -> "docid,title_s,en_abstract_s,fr_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s"
  Just lang -> "docid,title_s,en_abstract_s," <> langAbstractS lang <> ",submittedDate_s,source_s,authFullName_s,authOrganism_s"
  Nothing -> requestedFields (Just EN)
structFields :: Text
......
......@@ -4,8 +4,12 @@ module HAL.Doc.Corpus where
import Control.Lens qualified as L
import Data.Aeson
import Data.Aeson.Key (fromText)
import Data.Default
import Data.Map.Strict qualified as Map
import GHC.Generics
import HAL.Utils (allLangs, langAbstractS)
import Data.LanguageCodes (ISO639_1(..))
import Protolude
import Servant.API (ToHttpApiData(..))
......@@ -13,6 +17,7 @@ data Corpus = Corpus
{ _corpus_docid :: Text
, _corpus_title :: [Text]
, _corpus_abstract :: [Text]
, _corpus_abstract_lang_map :: Map ISO639_1 [Text]
, _corpus_date :: Maybe Text
, _corpus_source :: Maybe Text
, _corpus_authors_names :: [Text]
......@@ -22,19 +27,26 @@ data Corpus = Corpus
L.makeLenses ''Corpus
-- | Placeholder 'Corpus': the document id is the literal @"default Id"@ and
-- each of the remaining eight fields takes the 'def' of its own type.
--
-- Only one binding of 'def' may exist in the instance; the constructor is
-- applied to nine arguments to match the nine record fields (including
-- '_corpus_abstract_lang_map').
instance Default Corpus where
  def = Corpus "default Id" def def def def def def def def
-- | Decode a 'Corpus' from a JSON object.
--
-- * @docid@ is required.
-- * @title_s@, @en_abstract_s@, @authFullName_s@, @authOrganism_s@ and
--   @structId_i@ fall back to the empty list when their parser fails.
-- * @submittedDate_s@ and @source_s@ are looked up with '.:?'.
-- * For every language in 'allLangs', the key named by 'langAbstractS' is
--   looked up with '.:?'; the languages that yield a value are collected
--   into '_corpus_abstract_lang_map'.
--
-- There is a single 'parseJSON' binding: the record is built with
-- record wildcards so that all nine fields are supplied.
instance FromJSON Corpus where
  parseJSON = withObject "Corpus" $ \o -> do
    _corpus_docid <- o .: "docid"
    _corpus_title <- o .: "title_s" <|> return []
    _corpus_abstract <- o .: "en_abstract_s" <|> return []
    _corpus_date <- o .:? "submittedDate_s"
    _corpus_source <- o .:? "source_s"
    _corpus_authors_names <- o .: "authFullName_s" <|> return []
    _corpus_authors_affiliations <- o .: "authOrganism_s" <|> return []
    _corpus_struct_id <- o .: "structId_i" <|> return []
    -- One optional lookup per known language; each result is paired with
    -- its language so the hits can be turned into a map below.
    abstracts <-
      mapM
        ( \lang -> do
            ma <- o .:? fromText (langAbstractS lang)
            pure $ (\a -> (lang, a)) <$> ma
        )
        allLangs
    let _corpus_abstract_lang_map = Map.fromList $ catMaybes abstracts
    pure $ Corpus {..}
-- | URL piece for a 'Corpus': a fixed comma-separated list of field names.
-- The 'Corpus' value itself is ignored.
instance ToHttpApiData Corpus where
  toUrlPiece =
    const "docid,title_s,en_abstract_s,fr_abstract_s,submittedDate_s,source_s,authFullName_s,authOrganism_s,structId_i"
module HAL.Utils where
import Data.LanguageCodes (ISO639_1(..), language)
import Data.Text qualified as T
import Protolude
-- | Every 'ISO639_1' value obtained by enumerating upwards from @toEnum 0@.
allLangs :: [ISO639_1]
allLangs = [toEnum 0 ..]
-- | Name of the HAL field that holds the abstract in the given language:
-- the two-letter ISO 639-1 code followed by @_abstract_s@
-- (for example @fr_abstract_s@ for 'FR').
langAbstractS :: ISO639_1 -> Text
langAbstractS lang = T.pack [c1, c2] <> "_abstract_s"
  where
    -- 'toChars' gives the two-letter code; 'language' gives the English
    -- name of the language, which is not what the field names use.
    (c1, c2) = toChars lang
......@@ -2,9 +2,9 @@ module Tree where
import Control.Lens.Getter ((^.))
import Data.List.Split (chunksOf)
import Data.Map qualified as Map
import Data.Map (insert)
import Data.Map.Internal (merge, preserveMissing, zipWithMatched)
import Data.Map.Strict qualified as Map
import Data.Map.Strict (insert)
import Data.Map.Strict.Internal (merge, preserveMissing, zipWithMatched)
import Data.Text qualified as T
import Data.Text.Format (format)
import Data.Text.Lazy qualified as TL
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment