Reconstruct abstract from the index

parent d7283561
{-|
Module : Main
Description : Textmining Collaborative Platform
Copyright : (c) CNRS, 2023-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext@: search, map, share
-}
module Main where
import Conduit
......@@ -55,8 +68,9 @@ fetchWorksC fltr _ = do
Right (mCount, c) -> do
putText $ "Count: " <> show mCount
_ <- runConduit $ c
.| takeC 1000
.| mapM_C (\(OA.Work { .. }) -> do
.| takeC 3
.| mapM_C (\w@(OA.Work { .. }) -> do
liftIO $ putText $ show id <> " :: " <> show display_name
liftIO $ putText abstract_reconstructed
)
pure ()
......@@ -37,6 +37,7 @@ library
OpenAlex.Client
OpenAlex.ServantClientLogging
OpenAlex.Types
OpenAlex.Utils
default-extensions:
DataKinds
DeriveGeneric
......
......@@ -21,6 +21,7 @@ import qualified Data.Text as T
import Data.Time (UTCTime)
import Data.Time.Calendar (Day)
import qualified Data.Time.Format as DTF
import OpenAlex.Utils (reconstructAbstract)
import Protolude hiding (Location, Meta)
-- API request types
......@@ -201,6 +202,7 @@ instance FromJSON SummaryStats where
-- | https://docs.openalex.org/api-entities/works/work-object
data Work = Work
{ abstract_inverted_index :: Maybe (Map Text [Int]) -- TODO
, abstract_reconstructed :: Text
, authorships :: [Authorship]
, apc_list :: Maybe APCList
, apc_paid :: Maybe APCPaid
......@@ -242,6 +244,7 @@ data Work = Work
instance FromJSON Work where
parseJSON = withObject "Work" $ \v -> do
abstract_inverted_index <- v .: "abstract_inverted_index"
let abstract_reconstructed = reconstructAbstract abstract_inverted_index
authorships <- v .: "authorships"
apc_list <- v .: "apc_list"
apc_paid <- v .: "apc_paid"
......
{-|
Module : OpenAlex Utils
Description : Textmining Collaborative Platform
Copyright : (c) CNRS, 2023
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module OpenAlex.Utils where
import qualified Data.Map.Strict as Map
import qualified Data.Text as T
import Protolude
-- | https://docs.openalex.org/api-entities/works/work-object#abstract_inverted_index
-- https://en.wikipedia.org/wiki/Inverted_index
-- For legal reasons, OpenAlex doesn't include abstract full text. But
-- we can reconstruct most of it from the index.
-- The index is of form: { word : [ positions-in-text ] }
reconstructAbstract :: Maybe (Map Text [Int]) -> Text
reconstructAbstract Nothing = ""
reconstructAbstract (Just m) = T.intercalate " " $ snd <$> sort wordPositions
where
wordPositions = concatMap (\(word, positions) -> (\pos -> (pos, word)) <$> positions) $ Map.toList m
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment