Commit a90c4419 authored by mzheng's avatar mzheng

build _corpus_struct_id and _corpus_authors_affiliations with structIdName_fs

parent 827e9bdd
......@@ -57,18 +57,21 @@ opts = info (params <**> helper)
<> header "crawlerHAL-exe")
main :: IO ()
main = do
res <- getMetadataWith [generateRequestByStructID "camera" imt] (Just 0) (Just 55) (Just EN)
case res of
(Left err) -> print err
(Right val) -> mapM_ (print . cleanShow) $ _docs val
cleanShow :: Corpus -> Text
cleanShow corp = cleanAuthorsAffiliations corp
main = do run =<< execParser opts
-- res <- getMetadataWith [generateRequestByStructID "(en_title_t:(glass) OR en_abstract_t:(glass)) AND (language_s:en)" imt] (Just 0) (Just 555) (Just EN)
-- case res of
-- (Left err) -> print err
-- (Right val) -> do
-- mapM_ printCorpus $ _docs val
-- this function is for debug purpose
printCorpus :: MonadIO m => Corpus -> m ()
printCorpus Corpus { .. } = do
putText $ "StructId: [" <> T.intercalate ", " (map (T.pack . show) _corpus_struct_id) <> "]"
putText $ "Authors affiliations: [" <> T.intercalate ", " _corpus_authors_affiliations <> "]"
putText "------------"
cleanAuthorsAffiliations :: Corpus -> Text
cleanAuthorsAffiliations corp = T.append "Authors affiliations : " $ T.intercalate " | " $ _corpus_authors_affiliations corp
run :: Command -> IO ()
run (Count (CountParams { cp_query, cp_lang })) = do
......@@ -123,8 +126,8 @@ run (Fetch (FetchParams { fp_query, fp_limit, fp_lang })) = do
-- OR structId_i:1048346
-- OR structId_i:352124)|]
imt :: [T.Text]
imt = [
-- imt :: [T.Text]
-- imt = [
-- "224096"
-- ,"144103"
-- ,"84538"
......@@ -140,6 +143,6 @@ imt = [
-- ,"481355"
-- ,"469216"
-- ,"542824"
"6279"
-- ,"6279"
-- ,"29212"
]
-- ]
......@@ -231,7 +231,7 @@ baseFields = [ "docid"
, "source_s"
, "authFullName_s"
, "authOrganism_s"
, "instStructName_s" ]
, "structIdName_fs" ]
structFields :: Text
structFields = "docid,label_s,parentDocid_i"
......
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
module HAL.Doc.Corpus where
import Control.Lens qualified as L
import Data.Aeson ( (.:), (.:?), withObject, FromJSON(parseJSON), Object )
import Data.Aeson --( (.:), (.:?), withObject, FromJSON(parseJSON), Object )
import Data.Default (Default(..))
import Data.Map.Strict qualified as Map
import Data.String (IsString(fromString))
......@@ -12,6 +14,8 @@ import Data.LanguageCodes (ISO639_1(..))
import Protolude
import Servant.API (ToHttpApiData(..))
import qualified Data.Text as T
import Data.Aeson.Types (Parser)
import Prelude qualified as P
data Corpus = Corpus
{ _corpus_docid :: Text
......@@ -38,8 +42,13 @@ instance FromJSON Corpus where
_corpus_date <- o .:? "submittedDate_s"
_corpus_source <- o .:? "source_s"
_corpus_authors_names <- o .: "authFullName_s" <|> return []
_corpus_authors_affiliations <- o .: "instStructName_s" <|> return []
_corpus_struct_id <- o .: "structId_i" <|> return []
idsNames <- o .: "structIdName_fs" :: Parser [Text] --unsparsed (contains a _FacetSep_)
let structIdname = getStructIdsNames idsNames
let _corpus_struct_id = map fst structIdname
let _corpus_authors_affiliations = map snd structIdname
abstracts <-
mapM (\lang -> do
......@@ -51,5 +60,15 @@ instance FromJSON Corpus where
pure $ Corpus { .. }
-- | this function parses the field structIdName_fs that looks like :
-- > StructId_FacetSep_StructName
--
-- returns [(StructId, StructName)]
getStructIdsNames :: [T.Text] -> [(Int, T.Text)]
getStructIdsNames idsNames = map (\tab -> (P.read (T.unpack (P.head tab)) :: Int, P.last tab)) $ splitInstitutes idsNames
where
splitInstitutes :: [T.Text] -> [[T.Text]]
splitInstitutes = P.map (T.splitOn (T.pack "_FacetSep_"))
instance ToHttpApiData Corpus where
toUrlPiece _ = "docid,title_s,en_abstract_s,fr_abstract_s,submittedDate_s,source_s,authFullName_s,instStructName_s,structId_i"
toUrlPiece _ = "docid,title_s,en_abstract_s,fr_abstract_s,submittedDate_s,source_s,authFullName_s,structId_i,structIdName_fs"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment