Commit 16327c34 authored by Alexandre Delanoë

[FEAT] CSV ngrams extraction, engine and search.

parent 7997ab36
{-|
Module : CleanCsvCorpus
Description : Gargantext CSV corpus cleaner
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Given a Gargantext CSV file and a query,
this script cleans and compresses the contexts around the main terms of the query.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module CleanCsvCorpus where
--import GHC.IO (FilePath)
import Data.SearchEngine as S
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Prelude
import Gargantext.Text.Search
import Gargantext.Text.Parsers.CSV
------------------------------------------------------------------------
type Query = [S.Term]
filterDocs :: [DocId] -> Vector Doc -> Vector Doc
filterDocs docIds = V.filter (\doc -> S.member (d_docId doc) $ S.fromList docIds )
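A tiny usage sketch (the ids are illustrative; Doc and d_docId come from Gargantext.Text.Parsers.CSV below):

-- keep only the documents whose d_docId is 1 or 3; building a Set
-- first makes each membership test O(log n)
exampleFilter :: Vector Doc -> Vector Doc
exampleFilter = filterDocs [1, 3]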
main :: IO ()
main = do
let rPath = "/tmp/Gargantext_Corpus.csv"
let wPath = "/tmp/Gargantext_Corpus_bis.csv"
--let q = ["water", "scarcity", "morocco", "shortage","flood"]
let q = ["gratuit", "gratuité", "culture", "culturel"]
(h,csvDocs) <- readCsv rPath
putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
putStrLn $ "Mean size of docs:" <> show ( docsSize csvDocs)
let docs = toDocs csvDocs
let engine = insertDocs docs initialDocSearchEngine
let docIds = S.query engine q
let docs' = fromDocs $ filterDocs docIds (V.fromList docs)
putStrLn $ "Number of documents after:" <> show (V.length docs')
putStrLn $ "Mean size of docs:" <> show (docsSize docs')
writeCsv wPath (h, docs')
......@@ -6,9 +6,10 @@ License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
Script to start Gargantext with different modes (Dev, Prod, Mock).
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
......@@ -17,7 +18,6 @@ Portability : POSIX
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
module Main where
import Prelude (putStrLn)
......@@ -47,10 +47,9 @@ deriving instance Show (MyOptions Unwrapped)
main :: IO ()
main = do
MyOptions myMode myPort myIniFile <- unwrapRecord
"Gargantext: collaborative platform for text-mining"
let myPort' = case myPort of
Just p -> p
Nothing -> 8008
......@@ -64,6 +63,6 @@ main = do
Just i -> i
Mock -> startGargantextMock myPort'
_ -> startGargantextMock myPort'
putStrLn $ "Starting Gargantext with mode: " <> show myMode
start
......@@ -58,6 +58,7 @@ library:
- duckling
- exceptions
- filepath
- fullstop
- fclabels
- fast-logger
- full-text-search
......
......@@ -23,8 +23,6 @@ module Gargantext.API.Node
where
-------------------------------------------------------------------
import System.IO (putStrLn)
import Control.Monad.IO.Class (liftIO)
import Control.Monad ((>>))
--import System.IO (putStrLn, readFile)
......@@ -98,13 +96,13 @@ type FacetDocAPI = "table"
-- | Node API functions
roots :: Connection -> Server Roots
roots conn = liftIO (putStrLn "Log Needed" >> getNodesWithParentId conn 0 Nothing)
roots conn = liftIO (putStrLn ( "Log Needed" :: Text) >> getNodesWithParentId conn 0 Nothing)
:<|> pure (panic "not implemented yet")
:<|> pure (panic "not implemented yet")
:<|> pure (panic "not implemented yet")
nodeAPI :: Connection -> NodeId -> Server NodeAPI
nodeAPI conn id = liftIO (putStrLn "getNode" >> getNode conn id )
nodeAPI conn id = liftIO (putStrLn ("getNode" :: Text) >> getNode conn id )
:<|> deleteNode' conn id
:<|> getNodesWith' conn id
:<|> getFacet conn id
......
......@@ -33,7 +33,6 @@ import Database.PostgreSQL.Simple (Connection, connect)
import Opaleye (Query, Unpackspec, showSqlForPostgres)
import Data.Profunctor.Product.Default (Default)
import Data.Maybe (maybe)
import Prelude (id, putStrLn)
-- TODO add a reader Monad here
-- read this in the init file
......@@ -61,5 +60,5 @@ connectGargandb fp = do
connect parameters
printSql :: Default Unpackspec a a => Query a -> IO ()
printSql = putStrLn . maybe "Empty query" id . showSqlForPostgres
printSql = putStrLn . maybe "Empty query" identity . showSqlForPostgres
......@@ -30,8 +30,10 @@ module Gargantext.Prelude
import Data.Maybe (isJust, fromJust, maybe)
import Protolude ( Bool(True, False), Int, Double, Integer
, Fractional, Num, Maybe(Just,Nothing)
, Enum, Bounded, Float
, Floating, Char, IO
, pure, (<*>), (<$>), panic
, putStrLn
, head, flip
, Ord, Integral, Foldable, RealFrac, Monad, filter
, reverse, map, zip, drop, take, zipWith
......
......@@ -32,6 +32,7 @@ import Data.Map.Strict (Map
import Data.Text (Text, split)
import qualified Data.Map.Strict as M (filter)
import NLP.FullStop (segment)
-----------------------------------------------------------------
import Gargantext.Text.Ngrams
import Gargantext.Text.Metrics.Occurrences
......@@ -89,11 +90,16 @@ text2fis n xs = list2fis n (map ngrams xs)
-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
sentences txt = split isStop txt
sentences txt = map DT.pack $ segment $ DT.unpack txt
sentences' :: Text -> [Text]
sentences' txt = split isStop txt
isStop :: Char -> Bool
isStop c = c `elem` ['.','?','!']
unsentences :: [Text] -> Text
unsentences txts = DT.intercalate " " txts
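A GHCi-style sketch of the expected behaviour (exact whitespace handling depends on the fullstop segmenter, so the outputs are indicative):

-- >>> sentences "Hello. How are you? Fine!"
-- ["Hello.","How are you?","Fine!"]
-- >>> unsentences ["Hello.","How are you?"]
-- "Hello. How are you?"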
-- | https://en.wikipedia.org/wiki/Text_mining
testText_en :: Text
......
......@@ -26,7 +26,7 @@ module Gargantext.Text.Ngrams
where
import Data.Char (Char, isAlphaNum, isSpace)
import Data.Text (Text, split, splitOn, pack)
import Data.Text (Text, split, splitOn, pack, toLower)
import Data.Set (Set)
import qualified Data.Set as S
......@@ -64,7 +64,7 @@ equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2
--monograms xs = monograms $ toLower $ filter isGram xs
monograms :: Text -> [Text]
monograms txt = split isWord txt
monograms txt = map toLower $ split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
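An indicative GHCi sketch; note that Data.Text.split keeps empty chunks between adjacent separators (here, a comma followed by a space):

-- >>> monograms "Water scarcity, Morocco"
-- ["water","scarcity","","morocco"]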
......
......@@ -18,7 +18,7 @@ Source : https://en.wikipedia.org/wiki/Stemming
-}
module Gargantext.Text.Ngrams.Stem
module Gargantext.Text.Ngrams.Stem (stem, Lang(..))
where
import Data.Text (Text)
......@@ -38,7 +38,6 @@ import Gargantext.Core (Lang(..))
-- A stemmer for English, for example, should identify the string "cats"
-- (and possibly "catlike", "catty" etc.) as based on the root "cat".
-- and
-- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
-- algorithm reduces the words "fishing", "fished", and "fisher" to the
......@@ -48,7 +47,6 @@ import Gargantext.Core (Lang(..))
-- "arguments" reduce to the stem "argument".
stem :: Lang -> Text -> Text
stem lang = DT.pack . N.stem lang' . DT.unpack
where
......
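A GHCi-style sketch matching the examples in the comments above (assuming the underlying NLP.Stemmer behaves as a Porter-family stemmer):

-- >>> stem EN "fishing"
-- "fish"
-- >>> stem EN "arguments"
-- "argument"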
......@@ -7,8 +7,8 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
CSV parser for Gargantext corpus files.
-}
{-# LANGUAGE NoImplicitPrelude #-}
......@@ -17,29 +17,123 @@ commentary with @some markup@.
module Gargantext.Text.Parsers.CSV where
import GHC.Generics (Generic)
import GHC.Real (round)
import GHC.IO (FilePath)
import Data.Either (Either(Left, Right))
import Data.Text (Text)
import Control.Applicative
import qualified Data.ByteString.Lazy as BL
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.List (concat)
import Data.String (IsString)
import Data.Text (Text, pack, unpack, length)
import qualified Data.ByteString.Lazy as BL
import Data.Vector (Vector)
import qualified Data.Vector as V
import Data.Text (pack)
import Safe (tailMay)
import Text.HTML.TagSoup
import Data.Char (ord)
import Gargantext.Prelude
import Gargantext.Text
import Gargantext.Prelude hiding (length)
---------------------------------------------------------------
data Doc = Doc
{ d_docId :: !Int
, d_title :: !Text
, d_source :: !Text
, d_publication_year :: !Int
, d_publication_month :: !Int
, d_publication_day :: !Int
, d_abstract :: !Text
, d_authors :: !Text
}
deriving (Show)
---------------------------------------------------------------
toDocs :: Vector CsvDoc -> [Doc]
toDocs v = V.toList
         $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
                       -> Doc nId t s py pm pd abst auth )
                     (V.enumFromN 1 (V.length v'')) v''
  where
    -- recompute the mean size after each splitting pass, and number the
    -- final documents against v'' itself so none is dropped by zipWith
    m   = docsSize v
    v'  = V.concatMap (splitDoc m  Paragraph) v
    m'  = docsSize v'
    v'' = V.concatMap (splitDoc m' Sentences) v'
---------------------------------------------------------------
fromDocs :: Vector Doc -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs
where
fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
data SplitBy = Paragraph | Sentences | Chars
splitDoc :: Mean -> SplitBy -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc =
  if docSize > 1000 && mod (round m) docSize >= 10
    then splitDoc' splt doc
    else V.fromList [doc]
  where
    docSize = length $ c_abstract doc
splitDoc' :: SplitBy -> CsvDoc -> Vector CsvDoc
splitDoc' splt (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
  where
    -- the first chunk keeps the original title
    firstDoc      = CsvDoc t s py pm pd firstAbstract auth
    firstAbstract = head' abstracts
    -- each following chunk is titled by its own first sentence
    nextDocs  = map (\txt -> CsvDoc (head' $ sentences txt)
                                    s py pm pd
                                    (unsentences $ tail' $ sentences txt)
                                    auth
                    ) (tail' abstracts)
    abstracts = splitBy splt abst
    head' x   = maybe ""   identity (head x)
    tail' x   = maybe [""] identity (tailMay x)
splitBy :: SplitBy -> Text -> [Text]
splitBy Chars = map pack . chunkAlong 1000 1 . unpack
splitBy Sentences = map unsentences . chunkAlong 20 1 . sentences
splitBy Paragraph = map removeTag . filter isTagText . parseTags
where
removeTag :: IsString p => Tag p -> p
removeTag (TagText x) = x
removeTag (TagComment x) = x
removeTag _ = ""
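For intuition: splitBy Sentences slides a 20-sentence window one sentence at a time, and splitBy Chars does the same over 1000-character windows. Assuming chunkAlong n step (from Gargantext.Prelude) behaves as a sliding window:

-- chunkAlong 3 1 [1..5] == [[1,2,3],[2,3,4],[3,4,5]]  (assumed behaviour)
-- so each produced chunk overlaps the previous one by all but one unit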
---------------------------------------------------------------
---------------------------------------------------------------
type Mean = Double
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean ls
where
ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
---------------------------------------------------------------
data CsvDoc = CsvDoc
{ title :: !Text
, source :: !Text
, publication_year :: !Int
, publication_month :: !Int
, publication_day :: !Int
, abstract :: !Text
, authors :: !Text
{ c_title :: !Text
, c_source :: !Text
, c_publication_year :: !Int
, c_publication_month :: !Int
, c_publication_day :: !Int
, c_abstract :: !Text
, c_authors :: !Text
}
deriving (Show, Generic)
deriving (Show)
instance FromNamedRecord CsvDoc where
parseNamedRecord r = CsvDoc <$> r .: "title"
......@@ -63,20 +157,25 @@ instance ToNamedRecord CsvDoc where
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = (defaultDecodeOptions {decDelimiter = fromIntegral $ ord '\t'} )
csvDecodeOptions = (defaultDecodeOptions
{decDelimiter = fromIntegral $ ord '\t'}
)
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = (defaultEncodeOptions {encDelimiter = fromIntegral $ ord '\t'} )
csvEncodeOptions = ( defaultEncodeOptions
{encDelimiter = fromIntegral $ ord '\t'}
)
readCsv :: FilePath -> IO (Header, V.Vector CsvDoc)
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
csvData <- BL.readFile fp
case decodeByNameWith csvDecodeOptions csvData of
Left e -> panic (pack e)
Right csvDocs -> pure csvDocs
writeCsv :: FilePath -> (Header, V.Vector CsvDoc) -> IO ()
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp $
encodeByNameWith csvEncodeOptions h (V.toList vs)
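A minimal round-trip sketch, assuming a tab-separated file whose header matches the fields of the FromNamedRecord instance:

-- read a corpus and write it back unchanged; both directions use the
-- tab delimiter configured in csvDecodeOptions / csvEncodeOptions
copyCsv :: FilePath -> FilePath -> IO ()
copyCsv from to = do
  (h, docs) <- readCsv from
  writeCsv to (h, docs)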
{-|
Module : Gargantext.Text.Search
Description : Search engine to filter a corpus by query
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This search engine is first used to clean a CSV file according to a query.
Starting from this model, a specific Gargantext engine will be built
(using more metrics, scores and features).
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings, NamedFieldPuns #-}
module Gargantext.Text.Search where
import Data.SearchEngine
import Data.Ix
-- Useful for handling stopwords
-- import Data.Set (Set)
-- import qualified Data.Set as Set
import Data.Text (Text)
import Gargantext.Prelude
import Gargantext.Text.Ngrams
import Gargantext.Text.Ngrams.Stem as ST
import Gargantext.Text.Parsers.CSV
type DocId = Int
type DocSearchEngine = SearchEngine
Doc
DocId
DocField
NoFeatures
data DocField = TitleField
| AbstractField
deriving (Eq, Ord, Enum, Bounded, Ix, Show)
initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine =
initSearchEngine docSearchConfig defaultSearchRankParameters
docSearchConfig :: SearchConfig Doc DocId DocField NoFeatures
docSearchConfig =
SearchConfig {
documentKey = d_docId,
extractDocumentTerms = extractTokens,
transformQueryTerm = normaliseQueryToken,
documentFeatureValue = const noFeatures
}
where
extractTokens :: Doc -> DocField -> [Text]
extractTokens doc TitleField = monograms (d_title doc)
extractTokens doc AbstractField = monograms (d_abstract doc)
normaliseQueryToken :: Text -> DocField -> Text
normaliseQueryToken tok =
let tokStem = ST.stem ST.EN
in \field -> case field of
TitleField -> tokStem tok
AbstractField -> tokStem tok
defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures
defaultSearchRankParameters =
SearchRankParameters {
paramK1,
paramB,
paramFieldWeights,
paramFeatureWeights = noFeatures,
paramFeatureFunctions = noFeatures,
paramResultsetSoftLimit = 2000,
paramResultsetHardLimit = 4000,
paramAutosuggestPrefilterLimit = 500,
paramAutosuggestPostfilterLimit = 500
}
where
paramK1 :: Float
paramK1 = 1.5
paramB :: DocField -> Float
paramB TitleField = 0.9
paramB AbstractField = 0.5
paramFieldWeights :: DocField -> Float
paramFieldWeights TitleField = 20
paramFieldWeights AbstractField = 5
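An end-to-end sketch mirroring CleanCsvCorpus.main above (searchDocs is a hypothetical helper, not part of this commit):

-- index a corpus once, then return the ids of the documents matching a
-- query; query terms are stemmed and matched against title and abstract
-- monograms, with BM25 field weights favouring the title (20 vs 5)
searchDocs :: [Doc] -> [Term] -> [DocId]
searchDocs docs q = query (insertDocs docs initialDocSearchEngine) q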
......@@ -17,6 +17,7 @@ extra-deps:
- duckling-0.1.3.0
- extra-1.5.3
- full-text-search-0.2.1.4
- fullstop-0.1.4
- haskell-src-exts-1.18.2
- http-types-0.12.1
- protolude-0.2
......