Commit 16327c34 authored by Alexandre Delanoë

[FEAT] CSV ngrams extraction, engine and search.

parent 7997ab36
{-|
Module : CleanCsvCorpus.hs
Description : Clean a CSV corpus around a query
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Given a Gargantext CSV file and its query,
this script cleans and compresses the contexts around the main terms of the query.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module CleanCsvCorpus where
--import GHC.IO (FilePath)
import Data.SearchEngine as S
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Prelude
import Gargantext.Text.Search
import Gargantext.Text.Parsers.CSV
------------------------------------------------------------------------
type Query = [S.Term]

filterDocs :: [DocId] -> Vector Doc -> Vector Doc
filterDocs docIds = V.filter (\doc -> S.member (d_docId doc) $ S.fromList docIds)

main :: IO ()
main = do
  let rPath = "/tmp/Gargantext_Corpus.csv"
  let wPath = "/tmp/Gargantext_Corpus_bis.csv"
  --let q = ["water", "scarcity", "morocco", "shortage","flood"]
  let q = ["gratuit", "gratuité", "culture", "culturel"]

  (h, csvDocs) <- readCsv rPath

  putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
  putStrLn $ "Mean size of docs:" <> show (docsSize csvDocs)

  let docs   = toDocs csvDocs
  let engine = insertDocs docs initialDocSearchEngine
  let docIds = S.query engine q
  let docs'  = fromDocs $ filterDocs docIds (V.fromList docs)

  putStrLn $ "Number of documents after:" <> show (V.length docs')
  putStrLn $ "Mean size of docs:" <> show (docsSize docs')

  writeCsv wPath (h, docs')
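For reuse outside main, the same pipeline can be parameterised by the query; a minimal sketch (cleanCsv is hypothetical, not part of this commit, and uses only the functions already in scope above):

cleanCsv :: FilePath -> FilePath -> Query -> IO ()
cleanCsv rPath wPath q = do
  (h, csvDocs) <- readCsv rPath
  let docs   = toDocs csvDocs
      engine = insertDocs docs initialDocSearchEngine  -- build the index once
      docIds = S.query engine q                        -- ids of matching docs
  writeCsv wPath (h, fromDocs $ filterDocs docIds (V.fromList docs))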
...@@ -6,9 +6,10 @@ License : AGPL + CECILL v3
 Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX
--}
+Script to start gargantext with different modes (Dev, Prod, Mock).
+-}
 {-# LANGUAGE DataKinds     #-}
 {-# LANGUAGE DeriveGeneric #-}
...@@ -17,7 +18,6 @@ Portability : POSIX
 {-# LANGUAGE StandaloneDeriving #-}
 {-# LANGUAGE TypeOperators      #-}
-
 module Main where

 import Prelude (putStrLn)
...@@ -47,7 +47,6 @@ deriving instance Show (MyOptions Unwrapped)
 main :: IO ()
 main = do
-
     MyOptions myMode myPort myIniFile <- unwrapRecord
         "Gargantext: collaborative platform for text-mining"
......
...@@ -58,6 +58,7 @@ library:
     - duckling
     - exceptions
     - filepath
+    - fullstop
     - fclabels
     - fast-logger
     - full-text-search
......
...@@ -23,8 +23,6 @@ module Gargantext.API.Node
   where
 -------------------------------------------------------------------
-import System.IO (putStrLn)
-
 import Control.Monad.IO.Class (liftIO)
 import Control.Monad ((>>))
 --import System.IO (putStrLn, readFile)
...@@ -98,13 +96,13 @@ type FacetDocAPI = "table"
 -- | Node API functions
 roots :: Connection -> Server Roots
-roots conn = liftIO (putStrLn "Log Needed" >> getNodesWithParentId conn 0 Nothing)
+roots conn = liftIO (putStrLn ("Log Needed" :: Text) >> getNodesWithParentId conn 0 Nothing)
        :<|> pure (panic "not implemented yet")
        :<|> pure (panic "not implemented yet")
        :<|> pure (panic "not implemented yet")

 nodeAPI :: Connection -> NodeId -> Server NodeAPI
-nodeAPI conn id = liftIO (putStrLn "getNode" >> getNode conn id)
+nodeAPI conn id = liftIO (putStrLn ("getNode" :: Text) >> getNode conn id)
        :<|> deleteNode' conn id
        :<|> getNodesWith' conn id
        :<|> getFacet conn id
......
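The new (:: Text) annotations are needed because Gargantext.Prelude now re-exports putStrLn from protolude (see the Prelude hunk below), where it is generalised over a Print class, roughly:

-- putStrLn :: (Print a, MonadIO m) => a -> m ()   -- protolude-0.2, assumed signature
-- putStrLn ("getNode" :: Text)                    -- literal pinned under OverloadedStrings

Without the annotation, an OverloadedStrings literal has no unique Print instance to resolve to.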
...@@ -33,7 +33,6 @@ import Database.PostgreSQL.Simple (Connection, connect)
 import Opaleye (Query, Unpackspec, showSqlForPostgres)
 import Data.Profunctor.Product.Default (Default)
 import Data.Maybe (maybe)
-import Prelude (id, putStrLn)

 -- TODO add a reader Monad here
 -- read this in the init file
...@@ -61,5 +60,5 @@ connectGargandb fp = do
   connect parameters

 printSql :: Default Unpackspec a a => Query a -> IO ()
-printSql = putStrLn . maybe "Empty query" id . showSqlForPostgres
+printSql = putStrLn . maybe "Empty query" identity . showSqlForPostgres
...@@ -30,8 +30,10 @@ module Gargantext.Prelude
 import Data.Maybe (isJust, fromJust, maybe)
 import Protolude ( Bool(True, False), Int, Double, Integer
                  , Fractional, Num, Maybe(Just,Nothing)
+                 , Enum, Bounded, Float
                  , Floating, Char, IO
                  , pure, (<*>), (<$>), panic
+                 , putStrLn
                  , head, flip
                  , Ord, Integral, Foldable, RealFrac, Monad, filter
                  , reverse, map, zip, drop, take, zipWith
......
...@@ -32,6 +32,7 @@ import Data.Map.Strict (Map
 import Data.Text (Text, split)
 import qualified Data.Map.Strict as M (filter)
+import NLP.FullStop (segment)
 -----------------------------------------------------------------
 import Gargantext.Text.Ngrams
 import Gargantext.Text.Metrics.Occurrences
...@@ -89,11 +90,16 @@ text2fis n xs = list2fis n (map ngrams xs)
 -------------------------------------------------------------------
 -- Contexts of text
 sentences :: Text -> [Text]
-sentences txt = split isStop txt
+sentences txt = map DT.pack $ segment $ DT.unpack txt
+
+sentences' :: Text -> [Text]
+sentences' txt = split isStop txt

 isStop :: Char -> Bool
 isStop c = c `elem` ['.','?','!']
+
+unsentences :: [Text] -> Text
+unsentences txts = DT.intercalate " " txts

 -- | https://en.wikipedia.org/wiki/Text_mining
 testText_en :: Text
......
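The new sentences is segmentation-aware (fullstop handles terminators inside abbreviations better than a raw character split), while the old split-based version survives as sentences'. A behaviour sketch, with the caveat that exact whitespace handling is up to NLP.FullStop.segment, which is designed so its segments concatenate back to the input:

-- >>> sentences "It is dry. It floods!"
-- ["It is dry. ","It floods!"]      -- trailing whitespace may differ
--
-- unsentences rejoins with single spaces, so (unsentences . sentences)
-- normalises whitespace rather than being an exact identity.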
...@@ -26,7 +26,7 @@ module Gargantext.Text.Ngrams
   where

 import Data.Char (Char, isAlphaNum, isSpace)
-import Data.Text (Text, split, splitOn, pack)
+import Data.Text (Text, split, splitOn, pack, toLower)
 import Data.Set (Set)
 import qualified Data.Set as S
...@@ -64,7 +64,7 @@ equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2
 --monograms xs = monograms $ toLower $ filter isGram xs
 monograms :: Text -> [Text]
-monograms txt = split isWord txt
+monograms txt = map toLower $ split isWord txt
   where
     isWord c = c `elem` [' ', '\'', ',', ';']
......
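One subtlety of monograms: Data.Text.split keeps empty fragments between adjacent separators (a ", " produces one), so downstream consumers may still want to filter empties. A behaviour sketch:

-- >>> monograms "Water Scarcity, Morocco"
-- ["water","scarcity","","morocco"]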
...@@ -18,7 +18,7 @@ Source : https://en.wikipedia.org/wiki/Stemming
 -}

-module Gargantext.Text.Ngrams.Stem
+module Gargantext.Text.Ngrams.Stem (stem, Lang(..))
   where

 import Data.Text (Text)
...@@ -38,7 +38,6 @@ import Gargantext.Core (Lang(..))
 -- A stemmer for English, for example, should identify the string "cats"
 -- (and possibly "catlike", "catty" etc.) as based on the root "cat".
 -- and
-
 -- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
 -- algorithm reduces the words "fishing", "fished", and "fisher" to the
...@@ -48,7 +47,6 @@ import Gargantext.Core (Lang(..))
 -- "arguments" reduce to the stem "argument".
 stem :: Lang -> Text -> Text
 stem lang = DT.pack . N.stem lang' . DT.unpack
-
   where
......
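A usage sketch for the narrowed export list, with outputs matching the examples in the module's own commentary (the underlying stemmer package does the work):

-- >>> stem EN "fishing"
-- "fish"
-- >>> stem EN "arguments"
-- "argument"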
...@@ -7,8 +7,8 @@ Maintainer : team@gargantext.org
 Stability   : experimental
 Portability : POSIX

-Here is a longer description of this module, containing some
-commentary with @some markup@.
+CSV parser for Gargantext corpus files.
+
 -}

 {-# LANGUAGE NoImplicitPrelude #-}
...@@ -17,29 +17,123 @@ commentary with @some markup@.
 module Gargantext.Text.Parsers.CSV where

-import GHC.Generics (Generic)
+import GHC.Real (round)
 import GHC.IO (FilePath)
-import Data.Either (Either(Left, Right))
-import Data.Text (Text)
 import Control.Applicative
+import qualified Data.ByteString.Lazy as BL
+import Data.Char (ord)
 import Data.Csv
+import Data.Either (Either(Left, Right))
+import Data.List (concat)
+import Data.String (IsString)
+import Data.Text (Text, pack, unpack, length)
-import qualified Data.ByteString.Lazy as BL
+import Data.Vector (Vector)
 import qualified Data.Vector as V
-import Data.Text (pack)
+import Safe (tailMay)
+import Text.HTML.TagSoup
-import Data.Char (ord)
+import Gargantext.Text
-import Gargantext.Prelude
+import Gargantext.Prelude hiding (length)
+
+---------------------------------------------------------------
+data Doc = Doc
+  { d_docId             :: !Int
+  , d_title             :: !Text
+  , d_source            :: !Text
+  , d_publication_year  :: !Int
+  , d_publication_month :: !Int
+  , d_publication_day   :: !Int
+  , d_abstract          :: !Text
+  , d_authors           :: !Text
+  }
+  deriving (Show)
+---------------------------------------------------------------
+-- | Number the documents after both split passes; the ids are drawn from
+-- the final vector v'' so that every produced document gets one.
+toDocs :: Vector CsvDoc -> [Doc]
+toDocs v = V.toList
+         $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
+                       -> Doc nId t s py pm pd abst auth)
+                     (V.enumFromN 1 (V.length v'')) v''
+  where
+    m    = docsSize v
+    v'   = V.concatMap (splitDoc m  Paragraph) v
+    m'   = docsSize v'
+    v''  = V.concatMap (splitDoc m' Sentences) v'
+    m''  = docsSize v''                                -- defined but unused
+    v''' = V.concatMap (splitDoc m'' Sentences) v''    -- defined but unused
+---------------------------------------------------------------
+fromDocs :: Vector Doc -> Vector CsvDoc
+fromDocs docs = V.map fromDocs' docs
+  where
+    fromDocs' (Doc _ t s py pm pd abst auth) = CsvDoc t s py pm pd abst auth
+---------------------------------------------------------------
+-- | Split a document in its context
+-- TODO adapt the size of the paragraph according to the corpus average
+data SplitBy = Paragraph | Sentences | Chars
+
+splitDoc :: Mean -> SplitBy -> CsvDoc -> Vector CsvDoc
+splitDoc m splt doc =
+  let docSize = length (c_abstract doc) in
+  if docSize > 1000 && mod (round m) docSize >= 10
+    then splitDoc' splt doc
+    else V.fromList [doc]

+splitDoc' :: SplitBy -> CsvDoc -> Vector CsvDoc
+splitDoc' splt (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
+  where
+    firstDoc      = CsvDoc t s py pm pd firstAbstract auth
+    firstAbstract = head' abstracts
+    nextDocs      = map (\txt -> CsvDoc (head' $ sentences txt)
+                                        s py pm pd
+                                        (unsentences $ tail' $ sentences txt)
+                                        auth
+                        ) (tail' abstracts)
+    abstracts = splitBy splt abst
+    head' x   = maybe ""   identity (head x)
+    tail' x   = maybe [""] identity (tailMay x)

+splitBy :: SplitBy -> Text -> [Text]
+splitBy Chars     = map pack . chunkAlong 1000 1 . unpack
+splitBy Sentences = map unsentences . chunkAlong 20 1 . sentences
+splitBy Paragraph = map removeTag . filter isTagText . parseTags
+  where
+    removeTag :: IsString p => Tag p -> p
+    removeTag (TagText    x) = x
+    removeTag (TagComment x) = x
+    removeTag _              = ""
+---------------------------------------------------------------
+type Mean = Double
+
+docsSize :: Vector CsvDoc -> Mean
+docsSize csvDoc = mean ls
+  where
+    ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
+---------------------------------------------------------------

 data CsvDoc = CsvDoc
-    { title             :: !Text
-    , source            :: !Text
-    , publication_year  :: !Int
-    , publication_month :: !Int
-    , publication_day   :: !Int
-    , abstract          :: !Text
-    , authors           :: !Text
+    { c_title             :: !Text
+    , c_source            :: !Text
+    , c_publication_year  :: !Int
+    , c_publication_month :: !Int
+    , c_publication_day   :: !Int
+    , c_abstract          :: !Text
+    , c_authors           :: !Text
     }
-    deriving (Show, Generic)
+    deriving (Show)

 instance FromNamedRecord CsvDoc where
     parseNamedRecord r = CsvDoc <$> r .: "title"
...@@ -63,20 +157,25 @@ instance ToNamedRecord CsvDoc where

 csvDecodeOptions :: DecodeOptions
-csvDecodeOptions = (defaultDecodeOptions {decDelimiter = fromIntegral $ ord '\t'} )
+csvDecodeOptions = ( defaultDecodeOptions
+                       {decDelimiter = fromIntegral $ ord '\t'}
+                   )

 csvEncodeOptions :: EncodeOptions
-csvEncodeOptions = (defaultEncodeOptions {encDelimiter = fromIntegral $ ord '\t'} )
+csvEncodeOptions = ( defaultEncodeOptions
+                       {encDelimiter = fromIntegral $ ord '\t'}
+                   )

-readCsv :: FilePath -> IO (Header, V.Vector CsvDoc)
+readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 readCsv fp = do
     csvData <- BL.readFile fp
     case decodeByNameWith csvDecodeOptions csvData of
       Left  e       -> panic (pack e)
       Right csvDocs -> pure csvDocs

-writeCsv :: FilePath -> (Header, V.Vector CsvDoc) -> IO ()
+writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 writeCsv fp (h, vs) = BL.writeFile fp $
                       encodeByNameWith csvEncodeOptions h (V.toList vs)
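Note that, despite the module name, these options make readCsv and writeCsv speak tab-separated values (both delimiters are '\t'). A round-trip sketch (copyCorpus and the paths are illustrative, not part of the commit):

copyCorpus :: IO ()
copyCorpus = do
  (h, docs) <- readCsv "/tmp/in.csv"   -- panics (via readCsv) if decoding fails
  writeCsv "/tmp/out.csv" (h, docs)    -- reuses the parsed header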
{-|
Module : Gargantext.Text.Search
Description : Search engine for corpus documents
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This search engine is first used to clean a CSV file according to a query.
Starting from this model, a Gargantext-specific engine will be built
(using more metrics, scores, and features).
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings, NamedFieldPuns #-}
module Gargantext.Text.Search where
import Data.SearchEngine
import Data.Ix
-- Useful for using stopwords
-- import Data.Set (Set)
-- import qualified Data.Set as Set
import Data.Text (Text)
import Gargantext.Prelude
import Gargantext.Text.Ngrams
import Gargantext.Text.Ngrams.Stem as ST
import Gargantext.Text.Parsers.CSV
type DocId = Int

type DocSearchEngine = SearchEngine
                         Doc
                         DocId
                         DocField
                         NoFeatures

data DocField = TitleField
              | AbstractField
  deriving (Eq, Ord, Enum, Bounded, Ix, Show)
initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine =
    initSearchEngine docSearchConfig defaultSearchRankParameters

docSearchConfig :: SearchConfig Doc DocId DocField NoFeatures
docSearchConfig =
    SearchConfig {
      documentKey          = d_docId,
      extractDocumentTerms = extractTokens,
      transformQueryTerm   = normaliseQueryToken,
      documentFeatureValue = const noFeatures
    }
  where
    extractTokens :: Doc -> DocField -> [Text]
    extractTokens doc TitleField    = monograms (d_title doc)
    extractTokens doc AbstractField = monograms (d_abstract doc)

    normaliseQueryToken :: Text -> DocField -> Text
    normaliseQueryToken tok =
      let tokStem = ST.stem ST.EN
       in \field -> case field of
            TitleField    -> tokStem tok
            AbstractField -> tokStem tok
defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures
defaultSearchRankParameters =
    SearchRankParameters {
      paramK1,
      paramB,
      paramFieldWeights,
      paramFeatureWeights   = noFeatures,
      paramFeatureFunctions = noFeatures,
      paramResultsetSoftLimit = 2000,
      paramResultsetHardLimit = 4000,
      paramAutosuggestPrefilterLimit  = 500,
      paramAutosuggestPostfilterLimit = 500
    }
  where
    paramK1 :: Float
    paramK1 = 1.5

    paramB :: DocField -> Float
    paramB TitleField    = 0.9
    paramB AbstractField = 0.5

    paramFieldWeights :: DocField -> Float
    paramFieldWeights TitleField    = 20
    paramFieldWeights AbstractField = 5
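The K1/B/field-weight parameters are the usual BM25F knobs of the full-text-search library; titles are weighted four times as heavily as abstracts (20 vs 5). A usage sketch consistent with CleanCsvCorpus above (queryDocs is illustrative, not part of the commit):

queryDocs :: [Doc] -> [Text] -> [DocId]
queryDocs docs q = query (insertDocs docs initialDocSearchEngine) q

Query terms pass through normaliseQueryToken and are therefore stemmed; note that document tokens produced by monograms are not stemmed here, so a query like "fishing" only matches where stem and token coincide.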
...@@ -17,6 +17,7 @@ extra-deps:
 - duckling-0.1.3.0
 - extra-1.5.3
 - full-text-search-0.2.1.4
+- fullstop-0.1.4
 - haskell-src-exts-1.18.2
 - http-types-0.12.1
 - protolude-0.2
......