Commit 1dbd9392 authored by mzheng

renaming Corpus to Document for clarification

parent 3665ccda
......@@ -8,7 +8,7 @@ import Data.Text qualified as T
import HAL (getMetadataWithCursorOptsC, countResultsOpts', HalCrawlerOptions(..), defaultHalOptions, getMetadataWith, generateRequestByStructID)
import HAL.Types
import HAL.Doc
import HAL.Doc.Corpus (Corpus(..))
import HAL.Doc.Document (Document(..))
import Network.HTTP.Client (newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Options.Applicative
......@@ -63,13 +63,13 @@ main = do run =<< execParser opts
-- case res of
-- (Left err) -> print err
-- (Right val) -> do
-- mapM_ printCorpus $ _docs val
-- mapM_ printDocument $ _docs val
-- this function is for debug purpose
printCorpus :: MonadIO m => Corpus -> m ()
printCorpus Corpus { .. } = do
putText $ "StructId: [" <> T.intercalate ", " (map (T.pack . show) _corpus_struct_id) <> "]"
putText $ "Authors affiliations: [" <> T.intercalate ", " _corpus_authors_affiliations <> "]"
-- | Debugging helper: prints a document's structure ids and its authors'
-- affiliations, each as a bracketed comma-separated list, followed by a
-- separator line.
printDocument :: Document -> IO ()
printDocument Document { .. } = do
  putText $ "StructId: [" <> structIds <> "]"
  putText $ "Authors affiliations: [" <> affiliations <> "]"
  putText "------------"
  where
    -- Structure ids rendered through 'show' and joined with ", ".
    structIds = T.intercalate ", " (map (T.pack . show) _document_struct_id)
    -- Affiliation names are already 'Text'; only the joining is needed.
    affiliations = T.intercalate ", " _document_authors_affiliations
......@@ -88,16 +88,16 @@ run (Fetch (FetchParams { fp_query, fp_limit, fp_lang })) = do
Right (_cnt, docsC) -> do
_ <- runConduit $
docsC
.| mapM_C printCorpus
.| mapM_C printDocument
.| sinkList
pure ()
where
opts = defaultHalOptions { _hco_debugLogs = True }
printCorpus Corpus { .. } = do
putText $ "docid: " <> _corpus_docid <> " [" <> (T.intercalate " " _corpus_title) <> "]"
putText $ " " <> (T.intercalate " " _corpus_abstract)
putText $ " " <> show _corpus_abstract_lang_map
putText $ " " <> show _corpus_original
-- Prints one fetched document: docid with its title parts, the abstract
-- parts, the per-language abstract map, the raw JSON object, and finally
-- a separator line.
printDocument Document { .. } =
  mapM_ putText
    [ "docid: " <> _document_docid <> " [" <> T.intercalate " " _document_title <> "]"
    , " " <> T.intercalate " " _document_abstract
    , " " <> show _document_abstract_lang_map
    , " " <> show _document_original
    , "------------"
    ]
......
......@@ -29,7 +29,7 @@ library
HAL
HAL.Client
HAL.Doc
HAL.Doc.Corpus
HAL.Doc.Document
HAL.Doc.EntityTree
HAL.Doc.Struct
HAL.Types
......
......@@ -6,7 +6,7 @@ import Data.Aeson ( FromJSON )
import Data.LanguageCodes (ISO639_1(..))
import Data.Text qualified as T
import HAL.Client ( SortField(Asc), search, structure, searchCursor )
import HAL.Doc.Corpus ( Corpus )
import HAL.Doc.Document ( Document )
import HAL.Doc.Struct ( Struct )
import HAL.Types (Response(..))
import HAL.Utils (langAbstractS, toText)
......@@ -48,7 +48,7 @@ getMetadataWithCursorC :: Query
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
-> IO (Either ClientError (Maybe Count, ConduitT () Document IO ()))
getMetadataWithCursorC = getMetadataWithCursorOptsC defaultHalOptions
-- | Fetch metadata using cursors
......@@ -62,7 +62,7 @@ getMetadataWithCursorOptsC :: HalCrawlerOptions
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
-> IO (Either ClientError (Maybe Count, ConduitT () Document IO ()))
getMetadataWithCursorOptsC opts@HalCrawlerOptions { .. } q mb_limit lang = do
-- Basically this works as follows:
-- - fetch first page with cursor = "*"
......@@ -78,7 +78,7 @@ getMetadataWithCursorOptsC opts@HalCrawlerOptions { .. } q mb_limit lang = do
fq = queryWithLang lang
get' :: Count
-> (Maybe Count, ConduitT () Corpus IO ())
-> (Maybe Count, ConduitT () Document IO ())
get' numFound' =
( Just numResults
, producer "*"
......@@ -89,7 +89,7 @@ getMetadataWithCursorOptsC opts@HalCrawlerOptions { .. } q mb_limit lang = do
limit = min numFound' $ fromMaybe numFound' mb_limit
numResults = limit
producer :: Text -> ConduitT () Corpus IO ()
producer :: Text -> ConduitT () Document IO ()
producer cursor = do
let endpoint = searchCursor (Just q) (Just $ requestedFields lang) fq sort_ (Just $ fromIntegral _hco_batchSize) (Just cursor)
liftIO $ debugLog opts $ "[getMetadataWithCursorLangC] producer: " <> show cursor
......@@ -116,7 +116,7 @@ countResultsOpts' opts q lang = do
-- Set rows=0 to query number of results
-- https://api.archives-ouvertes.fr/docs/search#rows
-- First, estimate the total number of documents
eRes <- runHalAPIClient opts $ search (Just q) (Just $ requestedFields Nothing) fq Nothing (Just 0) (Just 0) :: IO (Either ClientError (Response Corpus))
eRes <- runHalAPIClient opts $ search (Just q) (Just $ requestedFields Nothing) fq Nothing (Just 0) (Just 0) :: IO (Either ClientError (Response Document))
pure (fromIntegral . _numFound <$> eRes)
where
fq = queryWithLang lang
......@@ -135,7 +135,7 @@ getMetadataWith :: [Query]
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Response Corpus))
-> IO (Either ClientError (Response Document))
getMetadataWith qs start_ limit lang = do
runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields lang) [] Nothing start_ (fromIntegral <$> limit)
where
......@@ -152,7 +152,7 @@ getMetadataWithC :: [Query]
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
-> IO (Either ClientError (Maybe Count, ConduitT () Document IO ()))
getMetadataWithC qs start_ limit lang = getMetadataWithOptsC defaultHalOptions (qs <> queryWithLang lang) start_ limit lang
getMetadataWithOptsC :: HalCrawlerOptions
......@@ -166,14 +166,14 @@ getMetadataWithOptsC :: HalCrawlerOptions
-- rows returned.
-> Maybe ISO639_1
-- ^ An optional language for the search.
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
-> IO (Either ClientError (Maybe Count, ConduitT () Document IO ()))
getMetadataWithOptsC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang = do
-- First, estimate the total number of documents
eCount <- countResults qs
pure $ get' <$> eCount
where
get' :: Count
-> (Maybe Count, ConduitT () Corpus IO ())
-> (Maybe Count, ConduitT () Document IO ())
get' numFound' =
( Just numResults
, yieldMany [0..]
......@@ -188,7 +188,7 @@ getMetadataWithOptsC opts@HalCrawlerOptions { .. } qs mb_offset mb_limit lang =
numResults = limit - fromIntegral offset
numPages = numResults `div` fromIntegral _hco_batchSize + 1
getPage :: Start -> Int -> IO [Corpus]
getPage :: Start -> Int -> IO [Document]
getPage start' pageNum = do
let offset = start' + pageNum * _hco_batchSize
debugLog opts $ "[getMetadataWithLangC] getPage: " <> show offset
......@@ -208,7 +208,7 @@ countResults qs = do
-- Set rows=0 to query number of results
-- https://api.archives-ouvertes.fr/docs/search#rows
-- First, estimate the total number of documents
eRes <- runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields Nothing) [] Nothing (Just 0) (Just 0) :: IO (Either ClientError (Response Corpus))
eRes <- runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields Nothing) [] Nothing (Just 0) (Just 0) :: IO (Either ClientError (Response Document))
pure (fromIntegral . _numFound <$> eRes)
where
q = joinQueries qs
......@@ -258,7 +258,7 @@ runStructureRequest :: Maybe Text -> IO (Either ClientError (Response Struct))
runStructureRequest rq =
runHalAPIClient defaultHalOptions $ structure (Just structFields) rq (Just 10000)
runSearchRequest :: [Text] -> IO (Either ClientError (Response Corpus))
runSearchRequest :: [Text] -> IO (Either ClientError (Response Document))
runSearchRequest rq =
runHalAPIClient defaultHalOptions $ search (Just q) (Just $ requestedFields Nothing) [] Nothing Nothing Nothing
where
......
module HAL.Doc
( module HAL.Doc.EntityTree
, module HAL.Doc.Corpus )
, module HAL.Doc.Document )
where
import HAL.Doc.EntityTree
import HAL.Doc.Corpus
import HAL.Doc.Document
......@@ -2,7 +2,7 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
module HAL.Doc.Corpus where
module HAL.Doc.Document where
import Control.Lens qualified as L
import Data.Aeson ( (.:), (.:?), withObject, FromJSON(parseJSON), Object )
......@@ -17,50 +17,50 @@ import qualified Data.Text as T
import Data.Aeson.Types (Parser)
import Prelude qualified as P
data Corpus = Corpus
{ _corpus_docid :: Text
, _corpus_title :: [Text]
, _corpus_abstract :: [Text]
, _corpus_abstract_lang_map :: Map ISO639_1 [Text]
, _corpus_date :: Maybe Text
, _corpus_source :: Maybe Text
, _corpus_authors_names :: [Text]
, _corpus_authors_affiliations :: [Text]
, _corpus_struct_id :: [Int]
, _corpus_original :: Object
-- | A single document as decoded from a HAL search response.
--
-- The JSON keys mentioned on each field are the ones read by the
-- 'FromJSON' instance of this type.
data Document = Document
{ _document_docid :: Text
-- ^ Value of the @docid@ key (the only key the parser requires).
, _document_title :: [Text]
-- ^ Values of @title_s@; empty when that key cannot be parsed.
, _document_abstract :: [Text]
-- ^ Values of @en_abstract_s@; empty when that key cannot be parsed.
, _document_abstract_lang_map :: Map ISO639_1 [Text]
-- ^ Abstracts keyed by language, holding only the languages for which an
-- abstract field was present in the JSON object.
, _document_date :: Maybe Text
-- ^ Optional value of @submittedDate_s@.
, _document_source :: Maybe Text
-- ^ Optional value of @source_s@.
, _document_authors_names :: [Text]
-- ^ Values of @authFullName_s@; empty when that key cannot be parsed.
, _document_authors_affiliations :: [Text]
-- ^ Structure names extracted from @structIdName_fs@.
, _document_struct_id :: [Int]
-- ^ Structure ids extracted from @structIdName_fs@.
, _document_original :: Object
-- ^ The raw JSON object this document was parsed from.
} deriving (Show, Generic)
L.makeLenses ''Corpus
L.makeLenses ''Document
instance Default Corpus where
def = Corpus "default Id" def def def def def def def def mempty
-- | Placeholder document: a fixed docid, every other field set to its own
-- 'def', and an empty raw JSON object.
instance Default Document where
  def = Document
    { _document_docid                = "default Id"
    , _document_title                = def
    , _document_abstract             = def
    , _document_abstract_lang_map    = def
    , _document_date                 = def
    , _document_source               = def
    , _document_authors_names        = def
    , _document_authors_affiliations = def
    , _document_struct_id            = def
    , _document_original             = mempty
    }
instance FromJSON Corpus where
parseJSON = withObject "Corpus" $ \o -> do
_corpus_docid <- o .: "docid"
_corpus_title <- o .: "title_s" <|> return []
_corpus_abstract <- o .: "en_abstract_s" <|> return []
_corpus_date <- o .:? "submittedDate_s"
_corpus_source <- o .:? "source_s"
_corpus_authors_names <- o .: "authFullName_s" <|> return []
-- | Decodes a 'Document' from a JSON object.
--
-- Only @docid@ is mandatory. List-valued fields fall back to an empty list
-- when they cannot be parsed, and the remaining fields are optional.
instance FromJSON Document where
parseJSON = withObject "Document" $ \o -> do
-- Mandatory: parsing fails when "docid" is missing.
_document_docid <- o .: "docid"
-- List-valued fields: fall back to [] when the lookup fails.
_document_title <- o .: "title_s" <|> return []
_document_abstract <- o .: "en_abstract_s" <|> return []
-- Optional scalar fields.
_document_date <- o .:? "submittedDate_s"
_document_source <- o .:? "source_s"
_document_authors_names <- o .: "authFullName_s" <|> return []
idsNames <- o .:? "structIdName_fs" :: Parser (Maybe [Text]) -- unparsed (each entry contains a _FacetSep_)
-- Split every raw entry into a (structure id, structure name) pair.
let structIdname = getStructIdsNames idsNames
let _corpus_struct_id = map fst structIdname
let _corpus_authors_affiliations = map snd structIdname
let _document_struct_id = map fst structIdname
let _document_authors_affiliations = map snd structIdname
-- For every language in allLangs, look up the abstract field whose name
-- is given by langAbstractS; languages without that field yield Nothing.
abstracts <-
mapM (\lang -> do
ma <- o .:? fromString (T.unpack $ langAbstractS lang)
pure $ (\a -> (lang, a)) <$> ma) allLangs
-- Keep only the languages for which an abstract was found.
let _corpus_abstract_lang_map = Map.fromList $ catMaybes abstracts
let _document_abstract_lang_map = Map.fromList $ catMaybes abstracts
-- Keep the raw object alongside the parsed fields.
let _corpus_original = o
let _document_original = o
pure $ Corpus { .. }
pure $ Document { .. }
-- | this function parses the field structIdName_fs that looks like :
-- | Parses the field structIdName_fs that looks like :
-- > StructId_FacetSep_StructName
--
-- returns [(StructId, StructName)]
......@@ -71,5 +71,5 @@ getStructIdsNames (Just idsNames) = map (\tab -> (P.read (T.unpack (P.head tab))
splitInstitutes = P.map (T.splitOn (T.pack "_FacetSep_"))
getStructIdsNames Nothing = []
instance ToHttpApiData Corpus where
-- | The URL piece is a fixed, comma-separated list of HAL field names; the
-- 'Document' value itself is ignored.
instance ToHttpApiData Document where
  toUrlPiece =
    const "docid,title_s,en_abstract_s,fr_abstract_s,submittedDate_s,source_s,authFullName_s,structId_i,structIdName_fs"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment