[lang] fixes to requestedFields, Main updated, query is an array now

parent 240a13e7
......@@ -21,10 +21,10 @@ import Tree
data CountParams = CountParams
{ cp_query :: T.Text }
{ cp_query :: [ T.Text ] }
data FetchParams = FetchParams
{ fp_query :: T.Text }
{ fp_query :: [ T.Text ] }
data Command =
Count CountParams
......@@ -33,12 +33,12 @@ data Command =
countParams :: Parser Command
countParams = Count <$>
(CountParams
<$> strArgument (metavar "query"))
<$> many (strArgument (metavar "query")))
fetchParams :: Parser Command
fetchParams = Fetch <$>
(FetchParams
<$> strArgument (metavar "query"))
<$> many (strArgument (metavar "query")))
params :: Parser Command
params = subparser
......@@ -60,20 +60,27 @@ main = run =<< execParser opts
run :: Command -> IO ()
run (Count (CountParams { cp_query })) = do
res <- getMetadataWithC (cp_query) (Just 0) Nothing Nothing
res <- getMetadataWithC cp_query (Just 0) Nothing Nothing
case res of
Left err -> putText $ show err
Right (cnt, _docsC) -> putText $ show cnt
run (Fetch (FetchParams { fp_query })) = do
res <- getMetadataWithC (fp_query) (Just 0) Nothing Nothing
res <- getMetadataWithC fp_query (Just 0) Nothing Nothing
case res of
Left err -> putText $ show err
Right (_cnt, docsC) -> do
_ <- runConduit $
docsC
.| mapM_C (\(Corpus { .. }) -> putText $ "docid: " <> show _corpus_docid)
.| mapM_C printCorpus
.| sinkList
pure ()
where
printCorpus Corpus { .. } = do
putText $ "docid: " <> _corpus_docid <> " [" <> (T.intercalate " " _corpus_title) <> "]"
putText $ " " <> (T.intercalate " " _corpus_abstract)
putText $ " " <> show _corpus_abstract_lang_map
putText $ " " <> show _corpus_original
putText "------------"
-- data
......
-- Generated by stack2cabal
with-compiler: ghc-9.2.8
with-compiler: ghc-8.10.7
packages:
./
......
......@@ -48,7 +48,7 @@ library
RecordWildCards
TypeOperators
build-depends:
aeson >= 2.1.0 && < 2.3
aeson >= 1.5.6.0 && < 1.6
, base >=4.7 && <5
, bytestring >= 0.11.0 && < 0.13
, conduit >= 1.3.5 && < 1.4
......@@ -65,10 +65,10 @@ library
, servant >= 0.19 && < 0.21
, servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3
, text >= 2.0.2 && < 2.1
, text >= 1.2.3.0 && < 2.1
, text-format >= 0.3.2.1 && < 0.4
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.13.0 && < 0.14
, vector >= 0.12.0 && < 0.14
default-language: Haskell2010
executable crawlerHAL-exe
......@@ -88,7 +88,7 @@ executable crawlerHAL-exe
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
aeson >= 2.1.0 && < 2.3
aeson >= 1.5.6 && < 1.6
, base >=4.7 && <5
, bytestring >= 0.11.0 && < 0.13
, conduit >= 1.3.5 && < 1.4
......@@ -106,9 +106,9 @@ executable crawlerHAL-exe
, servant >= 0.19 && < 0.21
, servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3
, text >= 2.0.2 && < 2.1
, text >= 1.2.3.0 && < 2.1
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.13.0 && < 0.14
, vector >= 0.12.0 && < 0.14
default-language: Haskell2010
test-suite halCrawler-test
......@@ -129,7 +129,7 @@ test-suite halCrawler-test
TypeOperators
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends:
aeson >= 2.1.0 && < 2.3
aeson >= 1.5.6 && < 1.6
, base >=4.7 && <5
, bytestring >= 0.11.0 && < 0.13
, conduit >= 1.3.5 && < 1.4
......@@ -147,7 +147,7 @@ test-suite halCrawler-test
, servant >= 0.19 && < 0.21
, servant-client >= 0.19 && < 0.21
, split >= 0.2.3.5 && < 0.3
, text >= 2.0.2 && < 2.1
, text >= 1.2.3.0 && < 2.1
, utf8-string >= 1.0.2 && < 1.1
, vector >= 0.13.0 && < 0.14
, vector >= 0.12.0 && < 0.14
default-language: Haskell2010
......@@ -30,14 +30,14 @@ getMetadataWith :: Query
getMetadataWith q start_ limit lang = do
runHalAPIClient $ search (Just $ requestedFields lang) [q] Nothing start_ limit
getMetadataWithC :: Query
getMetadataWithC :: [Query]
-> Maybe Start
-> Maybe Limit
-> Maybe ISO639_1
-> IO (Either ClientError (Maybe Count, ConduitT () Corpus IO ()))
getMetadataWithC q start_ limit lang = do
getMetadataWithC qs start_ limit lang = do
-- First, estimate the total number of documents
eCount <- countResults q
eCount <- countResults qs
pure $ get' <$> eCount
where
get' :: Count
......@@ -55,8 +55,9 @@ getMetadataWithC q start_ limit lang = do
getPage :: Start -> Int -> IO [Corpus]
getPage start' pageNum = do
-- putText $ "requestedFields: " <> (show $ requestedFields lang)
let offset = start' + pageNum * batchSize
eRes <- runHalAPIClient $ search (Just $ requestedFields lang) [q] Nothing (Just offset) (Just $ fromIntegral batchSize)
eRes <- runHalAPIClient $ search (Just $ requestedFields lang) qs Nothing (Just offset) (Just $ fromIntegral batchSize)
pure $ case eRes of
Left _ -> []
Right (Response { _docs }) -> _docs
......@@ -65,10 +66,10 @@ getMetadataWithC q start_ limit lang = do
-- putText $ show _corpus_title
-- pure c
countResults :: Query -> IO (Either ClientError Count)
countResults q = do
countResults :: [Query] -> IO (Either ClientError Count)
countResults qs = do
-- First, estimate the total number of documents
eRes <- runHalAPIClient $ search (Just $ requestedFields Nothing) [q] Nothing (Just 0) (Just 1) :: IO (Either ClientError (Response Corpus))
eRes <- runHalAPIClient $ search (Just $ requestedFields Nothing) qs Nothing (Just 0) (Just 1) :: IO (Either ClientError (Response Corpus))
pure $ _numFound <$> eRes
requestedFields :: Maybe ISO639_1 -> Text
......
......@@ -23,22 +23,23 @@ data Corpus = Corpus
, _corpus_authors_names :: [Text]
, _corpus_authors_affiliations :: [Text]
, _corpus_struct_id :: [Int]
, _corpus_original :: Object
} deriving (Show, Generic)
L.makeLenses ''Corpus
instance Default Corpus where
def = Corpus "default Id" def def def def def def def def
def = Corpus "default Id" def def def def def def def def mempty
instance FromJSON Corpus where
parseJSON = withObject "Corpus" $ \o -> do
_corpus_docid <- (o .: "docid")
_corpus_title <- (o .: "title_s" <|> return [])
_corpus_abstract <- (o .: "en_abstract_s" <|> return [])
_corpus_date <- (o .:? "submittedDate_s")
_corpus_source <- (o .:? "source_s")
_corpus_authors_names <- (o .: "authFullName_s" <|> return [])
_corpus_authors_affiliations <- (o .: "authOrganism_s" <|> return [])
_corpus_struct_id <- (o .: "structId_i" <|> return [])
_corpus_docid <- o .: "docid"
_corpus_title <- o .: "title_s" <|> return []
_corpus_abstract <- o .: "en_abstract_s" <|> return []
_corpus_date <- o .:? "submittedDate_s"
_corpus_source <- o .:? "source_s"
_corpus_authors_names <- o .: "authFullName_s" <|> return []
_corpus_authors_affiliations <- o .: "authOrganism_s" <|> return []
_corpus_struct_id <- o .: "structId_i" <|> return []
abstracts <-
mapM (\lang -> do
......@@ -46,6 +47,8 @@ instance FromJSON Corpus where
pure $ (\a -> (lang, a)) <$> ma) allLangs
let _corpus_abstract_lang_map = Map.fromList $ catMaybes abstracts
let _corpus_original = o
pure $ Corpus { .. }
instance ToHttpApiData Corpus where
......
module HAL.Utils where
import Data.LanguageCodes (ISO639_1(..), language)
import Data.LanguageCodes (ISO639_1(..), toChars)
import Data.Text qualified as T
import Protolude
......@@ -8,4 +8,6 @@ allLangs :: [ISO639_1]
allLangs = enumFrom (toEnum 0) :: [ISO639_1]
langAbstractS :: ISO639_1 -> Text
langAbstractS lang = (T.pack $ language lang) <> "_abstract_s"
langAbstractS lang = (T.pack [l1, l2]) <> "_abstract_s"
where
(l1, l2) = toChars lang
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment