Commit 3d9b4e21 authored by Grégoire Locqueville's avatar Grégoire Locqueville

Removed dead code in `Core.Text...`

parent 91c73c45
...@@ -362,7 +362,6 @@ library ...@@ -362,7 +362,6 @@ library
Gargantext.Core.Text.Corpus.Parsers.FrameWrite Gargantext.Core.Text.Corpus.Parsers.FrameWrite
Gargantext.Core.Text.Corpus.Parsers.GrandDebat Gargantext.Core.Text.Corpus.Parsers.GrandDebat
Gargantext.Core.Text.Corpus.Parsers.Iramuteq Gargantext.Core.Text.Corpus.Parsers.Iramuteq
Gargantext.Core.Text.Corpus.Parsers.Isidore
Gargantext.Core.Text.Corpus.Parsers.JSON Gargantext.Core.Text.Corpus.Parsers.JSON
Gargantext.Core.Text.Corpus.Parsers.JSON.Istex Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
Gargantext.Core.Text.Corpus.Parsers.RIS Gargantext.Core.Text.Corpus.Parsers.RIS
...@@ -530,7 +529,6 @@ library ...@@ -530,7 +529,6 @@ library
, haskell-throttle , haskell-throttle
, hlcm ^>= 0.2.2 , hlcm ^>= 0.2.2
, hsinfomap ^>= 0.1 , hsinfomap ^>= 0.1
, hsparql ^>= 0.3.8
, hstatistics ^>= 0.3.1 , hstatistics ^>= 0.3.1
, http-api-data >= 0.5 && < 0.6 , http-api-data >= 0.5 && < 0.6
, http-client ^>= 0.7.14 , http-client ^>= 0.7.14
...@@ -573,7 +571,6 @@ library ...@@ -573,7 +571,6 @@ library
, quickcheck-instances ^>= 0.3.25.2 , quickcheck-instances ^>= 0.3.25.2
, rake ^>= 0.0.1 , rake ^>= 0.0.1
, random ^>= 1.2.1 , random ^>= 1.2.1
, rdf4h ^>= 3.1.1
, regex , regex
, replace-attoparsec ^>= 1.5.0.0 , replace-attoparsec ^>= 1.5.0.0
, resource-pool >= 0.4.0.0 && < 0.5 , resource-pool >= 0.4.0.0 && < 0.5
...@@ -604,7 +601,6 @@ library ...@@ -604,7 +601,6 @@ library
, stm-containers >= 1.2.0.3 && < 1.3 , stm-containers >= 1.2.0.3 && < 1.3
, stringsearch >= 0.3.6.6 , stringsearch >= 0.3.6.6
, swagger2 ^>= 2.8.7 , swagger2 ^>= 2.8.7
, tagsoup ^>= 0.14.8
, template-haskell ^>= 2.19.0.0 , template-haskell ^>= 2.19.0.0
, temporary ^>= 1.3 , temporary ^>= 1.3
, text ^>= 2.0.2 , text ^>= 2.0.2
......
...@@ -15,7 +15,6 @@ Text gathers terms in unit of contexts. ...@@ -15,7 +15,6 @@ Text gathers terms in unit of contexts.
module Gargantext.Core.Text module Gargantext.Core.Text
where where
import Data.Text (split)
import Data.Text qualified as DT import Data.Text qualified as DT
import Gargantext.Prelude hiding (filter) import Gargantext.Prelude hiding (filter)
import NLP.FullStop (segment) import NLP.FullStop (segment)
...@@ -85,18 +84,6 @@ instance Collage MultiTerme Mot where ...@@ -85,18 +84,6 @@ instance Collage MultiTerme Mot where
sentences :: Text -> [Text] sentences :: Text -> [Text]
sentences txt = map DT.pack $ segment $ DT.unpack txt sentences txt = map DT.pack $ segment $ DT.unpack txt
sentences' :: Text -> [Text]
sentences' txt = split isCharStop txt
isCharStop :: Char -> Bool
isCharStop c = c `elem` ['.','?','!']
unsentences :: [Text] -> Text
unsentences txts = DT.unwords txts
-- | Ngrams size -- | Ngrams size
size :: Text -> Int size :: Text -> Int
size t = 1 + DT.count " " t size t = 1 + DT.count " " t
...@@ -23,10 +23,7 @@ How to split contexts is describes in this module. ...@@ -23,10 +23,7 @@ How to split contexts is describes in this module.
module Gargantext.Core.Text.Context module Gargantext.Core.Text.Context
where where
import Data.Text (pack, unpack)
import Gargantext.Core.Text
import Gargantext.Prelude hiding (length) import Gargantext.Prelude hiding (length)
import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
------------------------------------------------------------------------ ------------------------------------------------------------------------
type Term = Text type Term = Text
...@@ -38,31 +35,5 @@ type TermList = [(Label, [MultiTerm])] ...@@ -38,31 +35,5 @@ type TermList = [(Label, [MultiTerm])]
type Sentence a = [a] -- or a nominal group type Sentence a = [a] -- or a nominal group
type Corpus a = [Sentence a] -- a list of sentences type Corpus a = [Sentence a] -- a list of sentences
-- type ConText a = [Sentence a]
-- type Corpus a = [ConText a]
------------------------------------------------------------------------
-- | Contexts definition to build/unbuild contexts. -- | Contexts definition to build/unbuild contexts.
data SplitContext = Chars Int | Sentences Int | Paragraphs Int data SplitContext = Chars Int | Sentences Int | Paragraphs Int
-- | splitBy contexts of Chars or Sentences or Paragraphs
-- To see some examples at a higher level (sentences and paragraph), see
-- 'Gargantext.Core.Text.Examples.ex_terms'
--
-- >>> splitBy (Chars 0) (pack "abcde")
-- ["a","b","c","d","e"]
--
-- >>> splitBy (Chars 1) (pack "abcde")
-- ["ab","bc","cd","de"]
--
-- >>> splitBy (Chars 2) (pack "abcde")
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy (Chars n) = map pack . chunkAlong (n+1) 1 . unpack
splitBy (Sentences n) = map unsentences . chunkAlong (n+1) 1 . sentences
splitBy (Paragraphs _) = map unTag . filter isTagText . parseTags
where
unTag :: IsString p => Tag p -> p
unTag (TagText x) = x
unTag _ = ""
module Gargantext.Core.Text.Corpus (makeSubcorpusFromQuery, subcorpusEasy) where module Gargantext.Core.Text.Corpus (makeSubcorpusFromQuery) where
import Control.Lens (view) import Control.Lens (view)
import Data.Set.Internal qualified as Set (singleton) import Data.Set.Internal qualified as Set (singleton)
import Data.Text qualified as T
import Gargantext.API.Dev (runCmdReplEasy)
import Gargantext.API.Errors.Types (BackendInternalError(InternalNodeError)) import Gargantext.API.Errors.Types (BackendInternalError(InternalNodeError))
import Gargantext.Core (Lang(EN)) import Gargantext.Core (Lang(EN))
import Gargantext.Core.NodeStory.Types (HasNodeStoryEnv) import Gargantext.Core.NodeStory.Types (HasNodeStoryEnv)
...@@ -18,7 +16,7 @@ import Gargantext.Database.Action.Metrics (updateContextScore, updateNgramsOccur ...@@ -18,7 +16,7 @@ import Gargantext.Database.Action.Metrics (updateContextScore, updateNgramsOccur
import Gargantext.Database.Action.Search (searchInCorpus) import Gargantext.Database.Action.Search (searchInCorpus)
import Gargantext.Database.Action.User (getUserId) import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus, hc_lang) import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus, hc_lang)
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeId(UnsafeMkNodeId), NodeType(..), nodeId2ContextId) import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), nodeId2ContextId)
import Gargantext.Database.Prelude (DBCmdWithEnv) import Gargantext.Database.Prelude (DBCmdWithEnv)
import Gargantext.Database.Query.Facet.Types (facetDoc_id) import Gargantext.Database.Query.Facet.Types (facetDoc_id)
import Gargantext.Database.Query.Table.Node (insertDefaultNode, copyNodeStories, defaultList, getNodeWithType) import Gargantext.Database.Query.Table.Node (insertDefaultNode, copyNodeStories, defaultList, getNodeWithType)
...@@ -28,21 +26,6 @@ import Gargantext.Database.Schema.Node (node_hyperdata) ...@@ -28,21 +26,6 @@ import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude import Gargantext.Prelude
-- | A version of the below function for use in the REPL (so you don't need to
-- manually import tons of constructors etc.)
subcorpusEasy :: Text -- ^ Username
-> Int -- ^ Original corpus ID
-> Text -- ^ Search string
-> Bool -- ^ Whether to reuse the parent term list (True) or recompute one from scratch (False)
-> IO ()
subcorpusEasy username cId rawQuery reuseParentList = do
let eitherQuery = Q.parseQuery $ Q.RawQuery rawQuery
case eitherQuery of
Left msg -> print $ "Error parsing query \"" <> rawQuery <> "\": " <> T.pack msg
Right query -> void $ runCmdReplEasy $ makeSubcorpusFromQuery (UserName username) (UnsafeMkNodeId cId) query reuseParentList
-- | Given a "parent" corpus and a query, search for all docs in the parent -- | Given a "parent" corpus and a query, search for all docs in the parent
-- that match the query, and create a corpus from those. The created corpus -- that match the query, and create a corpus from those. The created corpus
-- is inserted in the tree as a child of the parent corpus. -- is inserted in the tree as a child of the parent corpus.
......
...@@ -23,13 +23,8 @@ import Gargantext.Defaults qualified as Defaults ...@@ -23,13 +23,8 @@ import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (intercalate) import Gargantext.Prelude hiding (intercalate)
import HAL qualified import HAL qualified
import HAL.Doc.Document qualified as HAL import HAL.Doc.Document qualified as HAL
import HAL.Types qualified as HAL
import Servant.Client (ClientError) import Servant.Client (ClientError)
get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument]
get la q ml = do
eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) la
either (panicTrace . pack . show) (mapM (toDoc' la) . HAL._docs) eDocs
getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ())) getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do getC la q ml = do
......
...@@ -11,22 +11,18 @@ Portability : POSIX ...@@ -11,22 +11,18 @@ Portability : POSIX
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Core.Text.Corpus.API.Isidore ( module Gargantext.Core.Text.Corpus.API.Isidore
get ( get
-- * Internals (possibly unused?)
, isidore2tsvFile
) where ) where
import Data.Text qualified as Text import Data.Text qualified as Text
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers (cleanText) import Gargantext.Core.Text.Corpus.Parsers (cleanText)
import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Defaults qualified as Defaults import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (get) import Gargantext.Prelude hiding (get)
import Isidore qualified as Isidore import Isidore qualified
import Isidore.Client import Isidore.Client
import Servant.Client ( ClientError(DecodeFailure) ) import Servant.Client ( ClientError(DecodeFailure) )
...@@ -50,13 +46,6 @@ get lang l q a = do ...@@ -50,13 +46,6 @@ get lang l q a = do
hDocs <- mapM (isidoreToDoc lang) (toIsidoreDocs iDocs) hDocs <- mapM (isidoreToDoc lang) (toIsidoreDocs iDocs)
pure hDocs pure hDocs
isidore2tsvFile :: FilePath -> Lang -> Maybe Isidore.Limit
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery
-> IO ()
isidore2tsvFile fp lang li tq aq = do
hdocs <- get lang li tq aq
writeDocs2Tsv fp hdocs
isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
isidoreToDoc lang (IsidoreDoc t a d u s as) = do isidoreToDoc lang (IsidoreDoc t a d u s as) = do
let let
......
...@@ -10,12 +10,10 @@ Portability : POSIX ...@@ -10,12 +10,10 @@ Portability : POSIX
-} -}
{-# LANGUAGE DerivingStrategies #-} {-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE LambdaCase #-}
module Gargantext.Core.Text.Corpus.API.Pubmed module Gargantext.Core.Text.Corpus.API.Pubmed
( get ( get
-- * Internals for testing -- * Internals for testing
, ESearch(..)
, convertQuery , convertQuery
, getESearch , getESearch
) )
......
...@@ -29,7 +29,6 @@ module Gargantext.Core.Text.Corpus.Parsers ( ...@@ -29,7 +29,6 @@ module Gargantext.Core.Text.Corpus.Parsers (
, cleanText , cleanText
, parseFormatC , parseFormatC
, splitOn , splitOn
, etale
) where ) where
-- import Gargantext.Core.Text.Learn (detectLangDefault) -- import Gargantext.Core.Text.Learn (detectLangDefault)
...@@ -49,7 +48,6 @@ import Gargantext.API.Node.Corpus.New.Types (FileFormat(..)) ...@@ -49,7 +48,6 @@ import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.TSV (parseHal, parseTsv, parseTsvC) import Gargantext.Core.Text.Corpus.Parsers.TSV (parseHal, parseTsv, parseTsvC)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
import Gargantext.Core.Text.Corpus.Parsers.Iramuteq qualified as Iramuteq import Gargantext.Core.Text.Corpus.Parsers.Iramuteq qualified as Iramuteq
import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC, parseIstex) import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC, parseIstex)
import Gargantext.Core.Text.Corpus.Parsers.RIS qualified as RIS import Gargantext.Core.Text.Corpus.Parsers.RIS qualified as RIS
...@@ -178,44 +176,6 @@ filterZIPFileNameP Istex f = (takeExtension (unEntrySelector f) == ".json") && ...@@ -178,44 +176,6 @@ filterZIPFileNameP Istex f = (takeExtension (unEntrySelector f) == ".json") &&
filterZIPFileNameP _ _ = True filterZIPFileNameP _ _ = True
etale :: [HyperdataDocument] -> [HyperdataDocument]
etale = concatMap etale'
where
etale' :: HyperdataDocument -> [HyperdataDocument]
etale' h = map (\t -> h { _hd_abstract = Just t })
$ map snd
$ text2titleParagraphs 7 (maybe "" identity $ _hd_abstract h)
-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
-- parseFormat TsvGargV3 bs = pure $ parseTsv' $ DBL.fromStrict bs
-- parseFormat TsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
-- parseFormat RisPresse bs = do
-- docs <- mapM (toDoc RIS)
-- <$> snd
-- <$> enrichWith RisPresse
-- $ partitionEithers
-- $ [runParser' RisPresse bs]
-- pure $ Right docs
-- parseFormat WOS bs = do
-- docs <- mapM (toDoc WOS)
-- <$> snd
-- <$> enrichWith WOS
-- $ partitionEithers
-- $ [runParser' WOS bs]
-- pure $ Right docs
-- parseFormat ZIP bs = do
-- path <- emptySystemTempFile "parsed-zip"
-- DB.writeFile path bs
-- parsedZip <- withArchive path $ do
-- DM.keys <$> getEntries
-- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
-- parseFormat _ _ = undefined
-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile :: FileType parseFile :: FileType
-> FileFormat -> FileFormat
-> FilePath -> FilePath
......
...@@ -21,16 +21,13 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00" ...@@ -21,16 +21,13 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
module Gargantext.Core.Text.Corpus.Parsers.Date ( module Gargantext.Core.Text.Corpus.Parsers.Date (
dateSplit dateSplit
, mDateSplit , mDateSplit
, defaultDay
, defaultUTCTime
, split' , split'
) where ) where
import Data.List qualified as List import Data.List qualified as List
import Data.Text (unpack, splitOn, replace) import Data.Text (unpack, splitOn, replace)
import Data.Time (defaultTimeLocale, iso8601DateFormat, parseTimeM, toGregorian) import Data.Time (defaultTimeLocale, iso8601DateFormat, parseTimeM, toGregorian)
import Data.Time.Calendar qualified as DTC import Data.Time.Clock (UTCTime(..))
import Data.Time.Clock (UTCTime(..), secondsToDiffTime)
import Gargantext.Prelude hiding (replace) import Gargantext.Prelude hiding (replace)
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -113,11 +110,3 @@ readDate txt = do ...@@ -113,11 +110,3 @@ readDate txt = do
--let format = cs $ iso8601DateFormat (Just "%F %H:%M:%S") --let format = cs $ iso8601DateFormat (Just "%F %H:%M:%S")
let format = cs $ iso8601DateFormat Nothing let format = cs $ iso8601DateFormat Nothing
parseTimeM True defaultTimeLocale (unpack format) (cs txt) parseTimeM True defaultTimeLocale (unpack format) (cs txt)
defaultDay :: DTC.Day
defaultDay = DTC.fromGregorian 1 1 1
defaultUTCTime :: UTCTime
defaultUTCTime = UTCTime { utctDay = defaultDay
, utctDayTime = secondsToDiffTime 0 }
...@@ -35,39 +35,6 @@ import Text.Parsec.String ...@@ -35,39 +35,6 @@ import Text.Parsec.String
-- par défaut: un doc == 1 NodeWrite -- par défaut: un doc == 1 NodeWrite
-- ## mean each ## section will be a new document with title the subsubsection title. Either it features options for author, date etc. or it will inherit the document's option. -- ## mean each ## section will be a new document with title the subsubsection title. Either it features options for author, date etc. or it will inherit the document's option.
sample :: Text
sample =
unlines
[ "title1"
-- , "title2"
-- , "=="
-- , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
, "date: 2021-09-10"
, "source: someSource"
, "document contents 1"
, "document contents 2"
]
sampleUnordered :: Text
sampleUnordered =
unlines
[ "title1"
, "title2"
, "=="
, "document contents 1"
, "date: 2021-09-10"
, "authors: FirstName1, LastName1; FirstName2, LastName2"
, "source: someSource"
, "document contents 2"
]
-- parseSample = parse documentP "sample" (unpack sample)
-- parseSampleUnordered = parse documentP "sampleUnordered" (unpack sampleUnordered)
parseLinesSample :: Either ParseError Parsed
parseLinesSample = parseLines sample
parseLinesSampleUnordered :: Either ParseError Parsed
parseLinesSampleUnordered = parseLines sampleUnordered
data Author = data Author =
Author { firstName :: Text Author { firstName :: Text
, lastName :: Text } , lastName :: Text }
...@@ -114,14 +81,6 @@ parseLines text = foldl f emptyParsed <$> lst ...@@ -114,14 +81,6 @@ parseLines text = foldl f emptyParsed <$> lst
f (Parsed { .. }) (LSource s ) = Parsed { source = Just s , .. } f (Parsed { .. }) (LSource s ) = Parsed { source = Just s , .. }
f (Parsed { .. }) (LTitle t ) = Parsed { title = t , .. } f (Parsed { .. }) (LTitle t ) = Parsed { title = t , .. }
-- Source should be the name of the node
-- First line of each Context should be the title.
documentLinesP :: Parser [Line]
documentLinesP = do
t <- titleP
ls <- lineP `sepBy` newline
pure $ [LTitle $ pack t] ++ ls
documentLines :: Parser [Line] documentLines :: Parser [Line]
documentLines = do documentLines = do
ls <- lineP `sepBy` newline ls <- lineP `sepBy` newline
...@@ -157,27 +116,6 @@ contentsLineP = do ...@@ -157,27 +116,6 @@ contentsLineP = do
-------------------- --------------------
-- documentP = do
-- t <- titleP
-- a <- optionMaybe authorsP
-- d <- optionMaybe dateP
-- s <- optionMaybe sourceP
-- c <- contentsP
-- pure $ Parsed { title = pack t
-- , authors = fromMaybe [] a
-- , date = pack <$> d
-- , source = pack <$> s
-- , contents = pack c }
titleDelimiterP :: Parser ()
titleDelimiterP = do
_ <- newline
-- _ <- try (string "==")
pure ()
titleP :: Parser [Char]
titleP = manyTill anyChar (try titleDelimiterP)
authorsPrefixP :: Parser [Char] authorsPrefixP :: Parser [Char]
authorsPrefixP = do authorsPrefixP = do
_ <- string "authors:" _ <- string "authors:"
...@@ -225,12 +163,6 @@ sourceP = try sourcePrefixP ...@@ -225,12 +163,6 @@ sourceP = try sourcePrefixP
_ <- string "source:" _ <- string "source:"
many (char ' ') many (char ' ')
-- contentsP :: Parser String
-- contentsP = many anyChar
tokenEnd :: Parser ()
tokenEnd = void (char '\n') <|> eof
--- MISC Tools --- MISC Tools
-- Using ChunkAlong here enable redundancies in short corpora of texts -- Using ChunkAlong here enable redundancies in short corpora of texts
-- maybe use splitEvery or chunkAlong depending on the size of the whole text -- maybe use splitEvery or chunkAlong depending on the size of the whole text
...@@ -249,8 +181,3 @@ text2titleParagraphs n = catMaybes ...@@ -249,8 +181,3 @@ text2titleParagraphs n = catMaybes
doTitle :: [Text] -> Maybe (Text, Text) doTitle :: [Text] -> Maybe (Text, Text)
doTitle (t:ts) = Just (t, DT.unwords ts) doTitle (t:ts) = Just (t, DT.unwords ts)
doTitle [] = Nothing doTitle [] = Nothing
clean :: Text -> Text
clean = DT.unwords . List.filter (\w -> DT.length w < 25) . DT.words
...@@ -12,19 +12,18 @@ commentary with @some markup@. ...@@ -12,19 +12,18 @@ commentary with @some markup@.
-} -}
module Gargantext.Core.Text.Corpus.Parsers.Iramuteq (parseIramuteqFile, parser, keys) where module Gargantext.Core.Text.Corpus.Parsers.Iramuteq
( parser
, keys
)
where
import Control.Applicative import Control.Applicative
import Data.Attoparsec.ByteString (Parser, takeTill, parseOnly) import Data.Attoparsec.ByteString (Parser, takeTill, parseOnly)
import Data.Attoparsec.ByteString.Char8 (isEndOfLine, takeWhile, endOfLine) import Data.Attoparsec.ByteString.Char8 (isEndOfLine, takeWhile, endOfLine)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import Prelude hiding (takeWhile, take, concat, readFile, lines, concat) import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import qualified Data.ByteString as DB
parseIramuteqFile :: FilePath -> IO (Either String [[(ByteString, ByteString)]])
parseIramuteqFile fp = do
txts <- DB.readFile fp
pure $ parseOnly parser txts
------------------------------------------------------------- -------------------------------------------------------------
parser :: Parser [[(ByteString, ByteString)]] parser :: Parser [[(ByteString, ByteString)]]
......
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Isidore
Description : To query French Humanities publication database
Copyright : (c) CNRS, 2019-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
TODO:
- put endpoint in configuration file
- more flexible fields of research
- type database name
- use more ontologies to help building corpora
-}
{-# OPTIONS_GHC -fno-warn-deprecations #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Core.Text.Corpus.Parsers.Isidore where
import Data.ByteString.Lazy (ByteString)
import Data.RDF ( Node(LNode, UNode), LValue(PlainLL, TypedL, PlainL) )
import Data.Text qualified as T
import Database.HSparql.Connection ( BindingValue(..), EndPoint, structureContent )
import Database.HSparql.QueryGenerator
import Gargantext.Core (Lang)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding (ByteString)
import Network.Wreq (getWith, Response, defaults, header, param, responseStatus, responseBody)
import Prelude qualified
route :: EndPoint
route = "https://isidore.science/sparql/"
selectQueryRaw' :: Prelude.String -> Prelude.String -> IO (Response ByteString)
selectQueryRaw' uri q = getWith opts uri
where
opts = defaults & header "Accept" .~ ["application/sparql-results+xml"]
& header "User-Agent" .~ ["gargantext-hsparql-client"]
& param "query" .~ [T.pack q]
isidoreGet :: Lang -> Int -> Text -> IO (Maybe [HyperdataDocument])
isidoreGet la li q = do
bindingValues <- isidoreGet' li q
case bindingValues of
Nothing -> pure Nothing
Just dv -> pure $ Just $ map (bind2doc la) dv
isidoreGet' :: Int -> Text -> IO (Maybe [[BindingValue]])
isidoreGet' l q = do
let s = createSelectQuery $ isidoreSelect l q
putStrLn s
r <- selectQueryRaw' route s
putStrLn (show $ r ^. responseStatus :: Text)
pure $ structureContent $ r ^. responseBody
-- res <- selectQuery route $ simpleSelect q
-- pure res
isidoreSelect :: Int -> Text -> Query SelectQuery
isidoreSelect lim q = do
-- See Predefined Namespace Prefixes:
-- https://isidore.science/sparql?nsdecl
isidore <- prefix "isidore" (iriRef "http://isidore.science/class/")
rdf <- prefix "rdf" (iriRef "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
dcterms <- prefix "dcterms" (iriRef "http://purl.org/dc/terms/")
dc <- prefix "dc" (iriRef "http://purl.org/dc/elements/1.1/")
--iso <- prefix "fra" (iriRef "http://lexvo.org/id/iso639-3/")
--ore <- prefix "ore" (iriRef "http://www.openarchives.org/ore/terms/")
--bif <- prefix "bif" (iriRef "bif:")
link' <- var
title <- var
date <- var
abstract <- var
authors <- var
source <- var
langDoc <- var
publisher <- var
--agg <- var
triple_ link' (rdf .:. "type") (isidore .:. "Document")
triple_ link' (dcterms .:. "title") title
triple_ link' (dcterms .:. "date") date
triple_ link' (dcterms .:. "creator") authors
--triple_ link (dcterms .:. "language") langDoc
triple_ link' (dc .:. "description") abstract
--triple_ link (ore .:. "isAggregatedBy") agg
--triple_ agg (dcterms .:. "title") title
optional_ $ triple_ link' (dcterms .:. "source") source
optional_ $ triple_ link' (dcterms .:. "publisher") publisher
-- TODO FIX BUG with (.||.) operator
--filterExpr_ $ (.||.) (contains title q) (contains abstract q)
--filterExpr_ (containsWith authors q) -- (contains abstract q)
--filterExpr_ (containsWith title q) -- (contains abstract q)
--filterExpr_ $ (.||.) (containsWith title q) (contains abstract q)
filterExpr_ (containsWith title q)
-- TODO FIX filter with lang
--filterExpr_ $ langMatches title (str ("fra" :: Text))
--filterExpr_ $ (.==.) langDoc (str ("http://lexvo.org/id/iso639-3/fra" :: Text))
orderNextDesc date
limit_ lim
distinct_
selectVars [link', date, langDoc, authors, source, publisher, title, abstract]
-- | TODO : check if all cases are taken into account
unbound :: Lang -> BindingValue -> Maybe Text
unbound _ Unbound = Nothing
unbound _ (Bound (UNode x)) = Just x
unbound _ (Bound (LNode (TypedL x _))) = Just x
unbound _ (Bound (LNode (PlainL x))) = Just x
unbound l (Bound (LNode (PlainLL x l'))) = if l' == T.toLower (show l) then Just x else Nothing
unbound _ _ = Nothing
bind2doc :: Lang -> [BindingValue] -> HyperdataDocument
bind2doc l [ link', date, langDoc, authors, _source, publisher, title, abstract ] =
HyperdataDocument { _hd_bdd = Just "Isidore"
, _hd_doi = Nothing
, _hd_url = unbound l link'
, _hd_page = Nothing
, _hd_title = unbound l title
, _hd_authors = unbound l authors
, _hd_institutes = Nothing
, _hd_source = unbound l publisher
, _hd_abstract = unbound l abstract
, _hd_publication_date = unbound l date
, _hd_publication_year = Nothing
, _hd_publication_month = Nothing
, _hd_publication_day = Nothing
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = unbound l langDoc
, _hd_institutes_tree = Nothing }
bind2doc _ _ = undefined
...@@ -8,7 +8,6 @@ module Gargantext.Core.Text.Corpus.Query ( ...@@ -8,7 +8,6 @@ module Gargantext.Core.Text.Corpus.Query (
, QueryTerm(..) , QueryTerm(..)
, getQuery , getQuery
, parseQuery , parseQuery
, mapQuery
, renderQuery , renderQuery
, renderQueryTerm , renderQueryTerm
, interpretQuery , interpretQuery
...@@ -130,6 +129,3 @@ parseQuery (RawQuery txt) = bimap show (Query . BoolExpr.boolTreeToCNF) $ ...@@ -130,6 +129,3 @@ parseQuery (RawQuery txt) = bimap show (Query . BoolExpr.boolTreeToCNF) $
renderQuery :: Query -> RawQuery renderQuery :: Query -> RawQuery
renderQuery (Query cnf) = RawQuery . T.pack $ BoolExpr.boolExprPrinter (showsPrec 0) (BoolExpr.fromCNF cnf) "" renderQuery (Query cnf) = RawQuery . T.pack $ BoolExpr.boolExprPrinter (showsPrec 0) (BoolExpr.fromCNF cnf) ""
mapQuery :: (QueryTerm -> QueryTerm) -> Query -> Query
mapQuery f = Query . fmap (map f) . getQuery
...@@ -22,6 +22,11 @@ roots = [ '^Main\.main$' ...@@ -22,6 +22,11 @@ roots = [ '^Main\.main$'
, '^Gargantext\.API\.Ngrams\.List\.importTsvFile$' , '^Gargantext\.API\.Ngrams\.List\.importTsvFile$'
# Used by the tests
, '^Gargantext\.Core\.Text\.Corpus\.API\.Pubmed\.convertQuery$'
, '^Gargantext\.Core\.Text\.Corpus\.API\.Pubmed\.getESearch$'
# Template Haskell # Template Haskell
# Weeder is not smart enough to know what functions will be used by # Weeder is not smart enough to know what functions will be used by
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment