Commit 3d9b4e21 authored by Grégoire Locqueville's avatar Grégoire Locqueville

Removed dead code in `Core.Text...`

parent 91c73c45
......@@ -362,7 +362,6 @@ library
Gargantext.Core.Text.Corpus.Parsers.FrameWrite
Gargantext.Core.Text.Corpus.Parsers.GrandDebat
Gargantext.Core.Text.Corpus.Parsers.Iramuteq
Gargantext.Core.Text.Corpus.Parsers.Isidore
Gargantext.Core.Text.Corpus.Parsers.JSON
Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
Gargantext.Core.Text.Corpus.Parsers.RIS
......@@ -530,7 +529,6 @@ library
, haskell-throttle
, hlcm ^>= 0.2.2
, hsinfomap ^>= 0.1
, hsparql ^>= 0.3.8
, hstatistics ^>= 0.3.1
, http-api-data >= 0.5 && < 0.6
, http-client ^>= 0.7.14
......@@ -573,7 +571,6 @@ library
, quickcheck-instances ^>= 0.3.25.2
, rake ^>= 0.0.1
, random ^>= 1.2.1
, rdf4h ^>= 3.1.1
, regex
, replace-attoparsec ^>= 1.5.0.0
, resource-pool >= 0.4.0.0 && < 0.5
......@@ -604,7 +601,6 @@ library
, stm-containers >= 1.2.0.3 && < 1.3
, stringsearch >= 0.3.6.6
, swagger2 ^>= 2.8.7
, tagsoup ^>= 0.14.8
, template-haskell ^>= 2.19.0.0
, temporary ^>= 1.3
, text ^>= 2.0.2
......
......@@ -15,7 +15,6 @@ Text gathers terms in unit of contexts.
module Gargantext.Core.Text
where
import Data.Text (split)
import Data.Text qualified as DT
import Gargantext.Prelude hiding (filter)
import NLP.FullStop (segment)
......@@ -85,18 +84,6 @@ instance Collage MultiTerme Mot where
sentences :: Text -> [Text]
sentences txt = map DT.pack $ segment $ DT.unpack txt
sentences' :: Text -> [Text]
sentences' txt = split isCharStop txt
isCharStop :: Char -> Bool
isCharStop c = c `elem` ['.','?','!']
unsentences :: [Text] -> Text
unsentences txts = DT.unwords txts
-- | Ngrams size
size :: Text -> Int
size t = 1 + DT.count " " t
......@@ -23,10 +23,7 @@ How to split contexts is describes in this module.
module Gargantext.Core.Text.Context
where
import Data.Text (pack, unpack)
import Gargantext.Core.Text
import Gargantext.Prelude hiding (length)
import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
------------------------------------------------------------------------
type Term = Text
......@@ -38,31 +35,5 @@ type TermList = [(Label, [MultiTerm])]
type Sentence a = [a] -- or a nominal group
type Corpus a = [Sentence a] -- a list of sentences
-- type ConText a = [Sentence a]
-- type Corpus a = [ConText a]
------------------------------------------------------------------------
-- | Contexts definition to build/unbuild contexts.
data SplitContext = Chars Int | Sentences Int | Paragraphs Int
-- | splitBy contexts of Chars or Sentences or Paragraphs
-- To see some examples at a higher level (sentences and paragraph), see
-- 'Gargantext.Core.Text.Examples.ex_terms'
--
-- >>> splitBy (Chars 0) (pack "abcde")
-- ["a","b","c","d","e"]
--
-- >>> splitBy (Chars 1) (pack "abcde")
-- ["ab","bc","cd","de"]
--
-- >>> splitBy (Chars 2) (pack "abcde")
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy (Chars n) = map pack . chunkAlong (n+1) 1 . unpack
splitBy (Sentences n) = map unsentences . chunkAlong (n+1) 1 . sentences
splitBy (Paragraphs _) = map unTag . filter isTagText . parseTags
where
unTag :: IsString p => Tag p -> p
unTag (TagText x) = x
unTag _ = ""
module Gargantext.Core.Text.Corpus (makeSubcorpusFromQuery, subcorpusEasy) where
module Gargantext.Core.Text.Corpus (makeSubcorpusFromQuery) where
import Control.Lens (view)
import Data.Set.Internal qualified as Set (singleton)
import Data.Text qualified as T
import Gargantext.API.Dev (runCmdReplEasy)
import Gargantext.API.Errors.Types (BackendInternalError(InternalNodeError))
import Gargantext.Core (Lang(EN))
import Gargantext.Core.NodeStory.Types (HasNodeStoryEnv)
......@@ -18,7 +16,7 @@ import Gargantext.Database.Action.Metrics (updateContextScore, updateNgramsOccur
import Gargantext.Database.Action.Search (searchInCorpus)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata.Corpus (HyperdataCorpus, hc_lang)
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeId(UnsafeMkNodeId), NodeType(..), nodeId2ContextId)
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), nodeId2ContextId)
import Gargantext.Database.Prelude (DBCmdWithEnv)
import Gargantext.Database.Query.Facet.Types (facetDoc_id)
import Gargantext.Database.Query.Table.Node (insertDefaultNode, copyNodeStories, defaultList, getNodeWithType)
......@@ -28,21 +26,6 @@ import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude
-- | A version of the below function for use in the REPL (so you don't need to
-- manually import tons of constructors etc.)
subcorpusEasy :: Text -- ^ Username
-> Int -- ^ Original corpus ID
-> Text -- ^ Search string
-> Bool -- ^ Whether to reuse the parent term list (True) or recompute one from scratch (False)
-> IO ()
subcorpusEasy username cId rawQuery reuseParentList = do
let eitherQuery = Q.parseQuery $ Q.RawQuery rawQuery
case eitherQuery of
Left msg -> print $ "Error parsing query \"" <> rawQuery <> "\": " <> T.pack msg
Right query -> void $ runCmdReplEasy $ makeSubcorpusFromQuery (UserName username) (UnsafeMkNodeId cId) query reuseParentList
-- | Given a "parent" corpus and a query, search for all docs in the parent
-- that match the query, and create a corpus from those. The created corpus
-- is inserted in the tree as a child of the parent corpus.
......
......@@ -23,13 +23,8 @@ import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (intercalate)
import HAL qualified
import HAL.Doc.Document qualified as HAL
import HAL.Types qualified as HAL
import Servant.Client (ClientError)
get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument]
get la q ml = do
eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) la
either (panicTrace . pack . show) (mapM (toDoc' la) . HAL._docs) eDocs
getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do
......
......@@ -11,22 +11,18 @@ Portability : POSIX
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Core.Text.Corpus.API.Isidore (
get
-- * Internals (possibly unused?)
, isidore2tsvFile
module Gargantext.Core.Text.Corpus.API.Isidore
( get
) where
import Data.Text qualified as Text
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers (cleanText)
import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (get)
import Isidore qualified as Isidore
import Isidore qualified
import Isidore.Client
import Servant.Client ( ClientError(DecodeFailure) )
......@@ -50,13 +46,6 @@ get lang l q a = do
hDocs <- mapM (isidoreToDoc lang) (toIsidoreDocs iDocs)
pure hDocs
isidore2tsvFile :: FilePath -> Lang -> Maybe Isidore.Limit
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery
-> IO ()
isidore2tsvFile fp lang li tq aq = do
hdocs <- get lang li tq aq
writeDocs2Tsv fp hdocs
isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
isidoreToDoc lang (IsidoreDoc t a d u s as) = do
let
......
......@@ -10,12 +10,10 @@ Portability : POSIX
-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE LambdaCase #-}
module Gargantext.Core.Text.Corpus.API.Pubmed
( get
-- * Internals for testing
, ESearch(..)
, convertQuery
, getESearch
)
......
......@@ -29,7 +29,6 @@ module Gargantext.Core.Text.Corpus.Parsers (
, cleanText
, parseFormatC
, splitOn
, etale
) where
-- import Gargantext.Core.Text.Learn (detectLangDefault)
......@@ -49,7 +48,6 @@ import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.TSV (parseHal, parseTsv, parseTsvC)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
import Gargantext.Core.Text.Corpus.Parsers.Iramuteq qualified as Iramuteq
import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC, parseIstex)
import Gargantext.Core.Text.Corpus.Parsers.RIS qualified as RIS
......@@ -178,44 +176,6 @@ filterZIPFileNameP Istex f = (takeExtension (unEntrySelector f) == ".json") &&
filterZIPFileNameP _ _ = True
etale :: [HyperdataDocument] -> [HyperdataDocument]
etale = concatMap etale'
where
etale' :: HyperdataDocument -> [HyperdataDocument]
etale' h = map (\t -> h { _hd_abstract = Just t })
$ map snd
$ text2titleParagraphs 7 (maybe "" identity $ _hd_abstract h)
-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
-- parseFormat TsvGargV3 bs = pure $ parseTsv' $ DBL.fromStrict bs
-- parseFormat TsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
-- parseFormat RisPresse bs = do
-- docs <- mapM (toDoc RIS)
-- <$> snd
-- <$> enrichWith RisPresse
-- $ partitionEithers
-- $ [runParser' RisPresse bs]
-- pure $ Right docs
-- parseFormat WOS bs = do
-- docs <- mapM (toDoc WOS)
-- <$> snd
-- <$> enrichWith WOS
-- $ partitionEithers
-- $ [runParser' WOS bs]
-- pure $ Right docs
-- parseFormat ZIP bs = do
-- path <- emptySystemTempFile "parsed-zip"
-- DB.writeFile path bs
-- parsedZip <- withArchive path $ do
-- DM.keys <$> getEntries
-- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
-- parseFormat _ _ = undefined
-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile :: FileType
-> FileFormat
-> FilePath
......
......@@ -21,16 +21,13 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
module Gargantext.Core.Text.Corpus.Parsers.Date (
dateSplit
, mDateSplit
, defaultDay
, defaultUTCTime
, split'
) where
import Data.List qualified as List
import Data.Text (unpack, splitOn, replace)
import Data.Time (defaultTimeLocale, iso8601DateFormat, parseTimeM, toGregorian)
import Data.Time.Calendar qualified as DTC
import Data.Time.Clock (UTCTime(..), secondsToDiffTime)
import Data.Time.Clock (UTCTime(..))
import Gargantext.Prelude hiding (replace)
------------------------------------------------------------------------
......@@ -113,11 +110,3 @@ readDate txt = do
--let format = cs $ iso8601DateFormat (Just "%F %H:%M:%S")
let format = cs $ iso8601DateFormat Nothing
parseTimeM True defaultTimeLocale (unpack format) (cs txt)
defaultDay :: DTC.Day
defaultDay = DTC.fromGregorian 1 1 1
defaultUTCTime :: UTCTime
defaultUTCTime = UTCTime { utctDay = defaultDay
, utctDayTime = secondsToDiffTime 0 }
......@@ -35,39 +35,6 @@ import Text.Parsec.String
-- par défaut: un doc == 1 NodeWrite
-- ## mean each ## section will be a new document with title the subsubsection title. Either it features options for author, date etc. or it will inherit the document's option.
sample :: Text
sample =
unlines
[ "title1"
-- , "title2"
-- , "=="
-- , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
, "date: 2021-09-10"
, "source: someSource"
, "document contents 1"
, "document contents 2"
]
sampleUnordered :: Text
sampleUnordered =
unlines
[ "title1"
, "title2"
, "=="
, "document contents 1"
, "date: 2021-09-10"
, "authors: FirstName1, LastName1; FirstName2, LastName2"
, "source: someSource"
, "document contents 2"
]
-- parseSample = parse documentP "sample" (unpack sample)
-- parseSampleUnordered = parse documentP "sampleUnordered" (unpack sampleUnordered)
parseLinesSample :: Either ParseError Parsed
parseLinesSample = parseLines sample
parseLinesSampleUnordered :: Either ParseError Parsed
parseLinesSampleUnordered = parseLines sampleUnordered
data Author =
Author { firstName :: Text
, lastName :: Text }
......@@ -114,14 +81,6 @@ parseLines text = foldl f emptyParsed <$> lst
f (Parsed { .. }) (LSource s ) = Parsed { source = Just s , .. }
f (Parsed { .. }) (LTitle t ) = Parsed { title = t , .. }
-- Source should be the name of the node
-- First line of each Context should be the title.
documentLinesP :: Parser [Line]
documentLinesP = do
t <- titleP
ls <- lineP `sepBy` newline
pure $ [LTitle $ pack t] ++ ls
documentLines :: Parser [Line]
documentLines = do
ls <- lineP `sepBy` newline
......@@ -157,27 +116,6 @@ contentsLineP = do
--------------------
-- documentP = do
-- t <- titleP
-- a <- optionMaybe authorsP
-- d <- optionMaybe dateP
-- s <- optionMaybe sourceP
-- c <- contentsP
-- pure $ Parsed { title = pack t
-- , authors = fromMaybe [] a
-- , date = pack <$> d
-- , source = pack <$> s
-- , contents = pack c }
titleDelimiterP :: Parser ()
titleDelimiterP = do
_ <- newline
-- _ <- try (string "==")
pure ()
titleP :: Parser [Char]
titleP = manyTill anyChar (try titleDelimiterP)
authorsPrefixP :: Parser [Char]
authorsPrefixP = do
_ <- string "authors:"
......@@ -225,12 +163,6 @@ sourceP = try sourcePrefixP
_ <- string "source:"
many (char ' ')
-- contentsP :: Parser String
-- contentsP = many anyChar
tokenEnd :: Parser ()
tokenEnd = void (char '\n') <|> eof
--- MISC Tools
-- Using ChunkAlong here enable redundancies in short corpora of texts
-- maybe use splitEvery or chunkAlong depending on the size of the whole text
......@@ -249,8 +181,3 @@ text2titleParagraphs n = catMaybes
doTitle :: [Text] -> Maybe (Text, Text)
doTitle (t:ts) = Just (t, DT.unwords ts)
doTitle [] = Nothing
clean :: Text -> Text
clean = DT.unwords . List.filter (\w -> DT.length w < 25) . DT.words
......@@ -12,19 +12,18 @@ commentary with @some markup@.
-}
module Gargantext.Core.Text.Corpus.Parsers.Iramuteq (parseIramuteqFile, parser, keys) where
module Gargantext.Core.Text.Corpus.Parsers.Iramuteq
( parser
, keys
)
where
import Control.Applicative
import Data.Attoparsec.ByteString (Parser, takeTill, parseOnly)
import Data.Attoparsec.ByteString.Char8 (isEndOfLine, takeWhile, endOfLine)
import Data.ByteString (ByteString)
import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import qualified Data.ByteString as DB
parseIramuteqFile :: FilePath -> IO (Either String [[(ByteString, ByteString)]])
parseIramuteqFile fp = do
txts <- DB.readFile fp
pure $ parseOnly parser txts
-------------------------------------------------------------
parser :: Parser [[(ByteString, ByteString)]]
......
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Isidore
Description : To query French Humanities publication database
Copyright : (c) CNRS, 2019-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
TODO:
- put endpoint in configuration file
- more flexible fields of research
- type database name
- use more ontologies to help building corpora
-}
{-# OPTIONS_GHC -fno-warn-deprecations #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Core.Text.Corpus.Parsers.Isidore where
import Data.ByteString.Lazy (ByteString)
import Data.RDF ( Node(LNode, UNode), LValue(PlainLL, TypedL, PlainL) )
import Data.Text qualified as T
import Database.HSparql.Connection ( BindingValue(..), EndPoint, structureContent )
import Database.HSparql.QueryGenerator
import Gargantext.Core (Lang)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding (ByteString)
import Network.Wreq (getWith, Response, defaults, header, param, responseStatus, responseBody)
import Prelude qualified
route :: EndPoint
route = "https://isidore.science/sparql/"
selectQueryRaw' :: Prelude.String -> Prelude.String -> IO (Response ByteString)
selectQueryRaw' uri q = getWith opts uri
where
opts = defaults & header "Accept" .~ ["application/sparql-results+xml"]
& header "User-Agent" .~ ["gargantext-hsparql-client"]
& param "query" .~ [T.pack q]
isidoreGet :: Lang -> Int -> Text -> IO (Maybe [HyperdataDocument])
isidoreGet la li q = do
bindingValues <- isidoreGet' li q
case bindingValues of
Nothing -> pure Nothing
Just dv -> pure $ Just $ map (bind2doc la) dv
isidoreGet' :: Int -> Text -> IO (Maybe [[BindingValue]])
isidoreGet' l q = do
let s = createSelectQuery $ isidoreSelect l q
putStrLn s
r <- selectQueryRaw' route s
putStrLn (show $ r ^. responseStatus :: Text)
pure $ structureContent $ r ^. responseBody
-- res <- selectQuery route $ simpleSelect q
-- pure res
isidoreSelect :: Int -> Text -> Query SelectQuery
isidoreSelect lim q = do
-- See Predefined Namespace Prefixes:
-- https://isidore.science/sparql?nsdecl
isidore <- prefix "isidore" (iriRef "http://isidore.science/class/")
rdf <- prefix "rdf" (iriRef "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
dcterms <- prefix "dcterms" (iriRef "http://purl.org/dc/terms/")
dc <- prefix "dc" (iriRef "http://purl.org/dc/elements/1.1/")
--iso <- prefix "fra" (iriRef "http://lexvo.org/id/iso639-3/")
--ore <- prefix "ore" (iriRef "http://www.openarchives.org/ore/terms/")
--bif <- prefix "bif" (iriRef "bif:")
link' <- var
title <- var
date <- var
abstract <- var
authors <- var
source <- var
langDoc <- var
publisher <- var
--agg <- var
triple_ link' (rdf .:. "type") (isidore .:. "Document")
triple_ link' (dcterms .:. "title") title
triple_ link' (dcterms .:. "date") date
triple_ link' (dcterms .:. "creator") authors
--triple_ link (dcterms .:. "language") langDoc
triple_ link' (dc .:. "description") abstract
--triple_ link (ore .:. "isAggregatedBy") agg
--triple_ agg (dcterms .:. "title") title
optional_ $ triple_ link' (dcterms .:. "source") source
optional_ $ triple_ link' (dcterms .:. "publisher") publisher
-- TODO FIX BUG with (.||.) operator
--filterExpr_ $ (.||.) (contains title q) (contains abstract q)
--filterExpr_ (containsWith authors q) -- (contains abstract q)
--filterExpr_ (containsWith title q) -- (contains abstract q)
--filterExpr_ $ (.||.) (containsWith title q) (contains abstract q)
filterExpr_ (containsWith title q)
-- TODO FIX filter with lang
--filterExpr_ $ langMatches title (str ("fra" :: Text))
--filterExpr_ $ (.==.) langDoc (str ("http://lexvo.org/id/iso639-3/fra" :: Text))
orderNextDesc date
limit_ lim
distinct_
selectVars [link', date, langDoc, authors, source, publisher, title, abstract]
-- | TODO : check if all cases are taken into account
unbound :: Lang -> BindingValue -> Maybe Text
unbound _ Unbound = Nothing
unbound _ (Bound (UNode x)) = Just x
unbound _ (Bound (LNode (TypedL x _))) = Just x
unbound _ (Bound (LNode (PlainL x))) = Just x
unbound l (Bound (LNode (PlainLL x l'))) = if l' == T.toLower (show l) then Just x else Nothing
unbound _ _ = Nothing
bind2doc :: Lang -> [BindingValue] -> HyperdataDocument
bind2doc l [ link', date, langDoc, authors, _source, publisher, title, abstract ] =
HyperdataDocument { _hd_bdd = Just "Isidore"
, _hd_doi = Nothing
, _hd_url = unbound l link'
, _hd_page = Nothing
, _hd_title = unbound l title
, _hd_authors = unbound l authors
, _hd_institutes = Nothing
, _hd_source = unbound l publisher
, _hd_abstract = unbound l abstract
, _hd_publication_date = unbound l date
, _hd_publication_year = Nothing
, _hd_publication_month = Nothing
, _hd_publication_day = Nothing
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = unbound l langDoc
, _hd_institutes_tree = Nothing }
bind2doc _ _ = undefined
......@@ -8,7 +8,6 @@ module Gargantext.Core.Text.Corpus.Query (
, QueryTerm(..)
, getQuery
, parseQuery
, mapQuery
, renderQuery
, renderQueryTerm
, interpretQuery
......@@ -130,6 +129,3 @@ parseQuery (RawQuery txt) = bimap show (Query . BoolExpr.boolTreeToCNF) $
renderQuery :: Query -> RawQuery
renderQuery (Query cnf) = RawQuery . T.pack $ BoolExpr.boolExprPrinter (showsPrec 0) (BoolExpr.fromCNF cnf) ""
mapQuery :: (QueryTerm -> QueryTerm) -> Query -> Query
mapQuery f = Query . fmap (map f) . getQuery
......@@ -22,6 +22,11 @@ roots = [ '^Main\.main$'
, '^Gargantext\.API\.Ngrams\.List\.importTsvFile$'
# Used by the tests
, '^Gargantext\.Core\.Text\.Corpus\.API\.Pubmed\.convertQuery$'
, '^Gargantext\.Core\.Text\.Corpus\.API\.Pubmed\.getESearch$'
# Template Haskell
# Weeder is not smart enough to know what functions will be used by
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment