Commit 16327c34 authored by Alexandre Delanoë

[FEAT] CSV ngrams extraction, engine and search.

parent 7997ab36
{-|
Module : CleanCsvCorpus
Description : Gargantext CSV corpus cleaner
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Given a Gargantext CSV file and a query,
this script cleans and compresses the contexts around the main terms of the query.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module CleanCsvCorpus where
--import GHC.IO (FilePath)
import Data.SearchEngine as S
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Prelude
import Gargantext.Text.Search
import Gargantext.Text.Parsers.CSV
------------------------------------------------------------------------
type Query = [S.Term]
filterDocs :: [DocId] -> Vector Doc -> Vector Doc
filterDocs docIds = V.filter (\doc -> S.member (d_docId doc) $ S.fromList docIds )
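A tiny usage sketch (the ids are illustrative; Doc and d_docId come from Gargantext.Text.Parsers.CSV below):

-- keep only the documents whose d_docId is 1 or 3; building a Set
-- first makes each membership test O(log n)
exampleFilter :: Vector Doc -> Vector Doc
exampleFilter = filterDocs [1, 3]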
main :: IO ()
main = do
let rPath = "/tmp/Gargantext_Corpus.csv"
let wPath = "/tmp/Gargantext_Corpus_bis.csv"
--let q = ["water", "scarcity", "morocco", "shortage","flood"]
let q = ["gratuit", "gratuité", "culture", "culturel"]
(h,csvDocs) <- readCsv rPath
putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
putStrLn $ "Mean size of docs:" <> show ( docsSize csvDocs)
let docs = toDocs csvDocs
let engine = insertDocs docs initialDocSearchEngine
let docIds = S.query engine q
let docs' = fromDocs $ filterDocs docIds (V.fromList docs)
putStrLn $ "Number of documents after:" <> show (V.length docs')
putStrLn $ "Mean size of docs:" <> show (docsSize docs')
writeCsv wPath (h, docs')
......@@ -6,9 +6,10 @@ License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
Script to start Gargantext with different modes (Dev, Prod, Mock).
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
......@@ -17,7 +18,6 @@ Portability : POSIX
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
module Main where
import Prelude (putStrLn)
......@@ -47,10 +47,9 @@ deriving instance Show (MyOptions Unwrapped)
main :: IO ()
main = do
MyOptions myMode myPort myIniFile <- unwrapRecord
"Gargantext: collaborative platform for text-mining"
let myPort' = case myPort of
Just p -> p
Nothing -> 8008
......@@ -64,6 +63,6 @@ main = do
Just i -> i
Mock -> startGargantextMock myPort'
_ -> startGargantextMock myPort'
putStrLn $ "Starting Gargantext with mode: " <> show myMode
start
......@@ -58,6 +58,7 @@ library:
- duckling
- exceptions
- filepath
- fullstop
- fclabels
- fast-logger
- full-text-search
......
......@@ -23,8 +23,6 @@ module Gargantext.API.Node
where
-------------------------------------------------------------------
import System.IO (putStrLn)
import Control.Monad.IO.Class (liftIO)
import Control.Monad ((>>))
--import System.IO (putStrLn, readFile)
......@@ -98,13 +96,13 @@ type FacetDocAPI = "table"
-- | Node API functions
roots :: Connection -> Server Roots
roots conn = liftIO (putStrLn "Log Needed" >> getNodesWithParentId conn 0 Nothing)
roots conn = liftIO (putStrLn ( "Log Needed" :: Text) >> getNodesWithParentId conn 0 Nothing)
:<|> pure (panic "not implemented yet")
:<|> pure (panic "not implemented yet")
:<|> pure (panic "not implemented yet")
nodeAPI :: Connection -> NodeId -> Server NodeAPI
nodeAPI conn id = liftIO (putStrLn "getNode" >> getNode conn id )
nodeAPI conn id = liftIO (putStrLn ("getNode" :: Text) >> getNode conn id )
:<|> deleteNode' conn id
:<|> getNodesWith' conn id
:<|> getFacet conn id
......
......@@ -33,7 +33,6 @@ import Database.PostgreSQL.Simple (Connection, connect)
import Opaleye (Query, Unpackspec, showSqlForPostgres)
import Data.Profunctor.Product.Default (Default)
import Data.Maybe (maybe)
import Prelude (id, putStrLn)
-- TODO add a reader Monad here
-- read this in the init file
......@@ -61,5 +60,5 @@ connectGargandb fp = do
connect parameters
printSql :: Default Unpackspec a a => Query a -> IO ()
printSql = putStrLn . maybe "Empty query" id . showSqlForPostgres
printSql = putStrLn . maybe "Empty query" identity . showSqlForPostgres
......@@ -30,8 +30,10 @@ module Gargantext.Prelude
import Data.Maybe (isJust, fromJust, maybe)
import Protolude ( Bool(True, False), Int, Double, Integer
, Fractional, Num, Maybe(Just,Nothing)
, Enum, Bounded, Float
, Floating, Char, IO
, pure, (<*>), (<$>), panic
, putStrLn
, head, flip
, Ord, Integral, Foldable, RealFrac, Monad, filter
, reverse, map, zip, drop, take, zipWith
......
......@@ -32,6 +32,7 @@ import Data.Map.Strict (Map
import Data.Text (Text, split)
import qualified Data.Map.Strict as M (filter)
import NLP.FullStop (segment)
-----------------------------------------------------------------
import Gargantext.Text.Ngrams
import Gargantext.Text.Metrics.Occurrences
......@@ -89,11 +90,16 @@ text2fis n xs = list2fis n (map ngrams xs)
-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
sentences txt = split isStop txt
sentences txt = map DT.pack $ segment $ DT.unpack txt
sentences' :: Text -> [Text]
sentences' txt = split isStop txt
isStop :: Char -> Bool
isStop c = c `elem` ['.','?','!']
unsentences :: [Text] -> Text
unsentences txts = DT.intercalate " " txts
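A GHCi-style sketch of the expected behaviour (exact whitespace handling depends on the fullstop segmenter, so the outputs are indicative):

-- >>> sentences "Hello. How are you? Fine!"
-- ["Hello.","How are you?","Fine!"]
-- >>> unsentences ["Hello.","How are you?"]
-- "Hello. How are you?"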
-- | https://en.wikipedia.org/wiki/Text_mining
testText_en :: Text
......
......@@ -26,7 +26,7 @@ module Gargantext.Text.Ngrams
where
import Data.Char (Char, isAlphaNum, isSpace)
import Data.Text (Text, split, splitOn, pack)
import Data.Text (Text, split, splitOn, pack, toLower)
import Data.Set (Set)
import qualified Data.Set as S
......@@ -64,7 +64,7 @@ equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2
--monograms xs = monograms $ toLower $ filter isGram xs
monograms :: Text -> [Text]
monograms txt = split isWord txt
monograms txt = map toLower $ split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
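An indicative GHCi sketch; note that Data.Text.split keeps empty chunks between adjacent separators (here, a comma followed by a space):

-- >>> monograms "Water scarcity, Morocco"
-- ["water","scarcity","","morocco"]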
......
......@@ -18,7 +18,7 @@ Source : https://en.wikipedia.org/wiki/Stemming
-}
module Gargantext.Text.Ngrams.Stem
module Gargantext.Text.Ngrams.Stem (stem, Lang(..))
where
import Data.Text (Text)
......@@ -38,7 +38,6 @@ import Gargantext.Core (Lang(..))
-- A stemmer for English, for example, should identify the string "cats"
-- (and possibly "catlike", "catty" etc.) as based on the root "cat".
-- and
-- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
-- algorithm reduces the words "fishing", "fished", and "fisher" to the
......@@ -48,7 +47,6 @@ import Gargantext.Core (Lang(..))
-- "arguments" reduce to the stem "argument".
stem :: Lang -> Text -> Text
stem lang = DT.pack . N.stem lang' . DT.unpack
where
......
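A GHCi-style sketch matching the examples in the comments above (assuming the underlying NLP.Stemmer behaves as a Porter-family stemmer):

-- >>> stem EN "fishing"
-- "fish"
-- >>> stem EN "arguments"
-- "argument"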
......@@ -7,8 +7,8 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
CSV parser for Gargantext corpus files.
-}
{-# LANGUAGE NoImplicitPrelude #-}
......@@ -17,29 +17,123 @@ commentary with @some markup@.
module Gargantext.Text.Parsers.CSV where
import GHC.Generics (Generic)
import GHC.Real (round)
import GHC.IO (FilePath)
import Data.Either (Either(Left, Right))
import Data.Text (Text)
import Control.Applicative
import qualified Data.ByteString.Lazy as BL
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.List (concat)
import Data.String (IsString)
import Data.Text (Text, pack, unpack, length)
import qualified Data.ByteString.Lazy as BL
import Data.Vector (Vector)
import qualified Data.Vector as V
import Data.Text (pack)
import Safe (tailMay)
import Text.HTML.TagSoup
import Data.Char (ord)
import Gargantext.Prelude
import Gargantext.Text
import Gargantext.Prelude hiding (length)
---------------------------------------------------------------
data Doc = Doc
{ d_docId :: !Int
, d_title :: !Text
, d_source :: !Text
, d_publication_year :: !Int
, d_publication_month :: !Int
, d_publication_day :: !Int
, d_abstract :: !Text
, d_authors :: !Text
}
deriving (Show)
---------------------------------------------------------------
toDocs :: Vector CsvDoc -> [Doc]
toDocs v = V.toList
         $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
                       -> Doc nId t s py pm pd abst auth )
                     (V.enumFromN 1 (V.length v'')) v''
  where
    -- recompute the mean size after each splitting pass, and number the
    -- final documents against v'' itself so none is dropped by zipWith
    m   = docsSize v
    v'  = V.concatMap (splitDoc m  Paragraph) v
    m'  = docsSize v'
    v'' = V.concatMap (splitDoc m' Sentences) v'
---------------------------------------------------------------
fromDocs :: Vector Doc -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs
where
fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
data SplitBy = Paragraph | Sentences | Chars
splitDoc :: Mean -> SplitBy -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc =
  if docSize > 1000 && mod (round m) docSize >= 10
    then splitDoc' splt doc
    else V.fromList [doc]
  where
    docSize = length $ c_abstract doc
splitDoc' :: SplitBy -> CsvDoc -> Vector CsvDoc
splitDoc' splt (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
  where
    -- the first chunk keeps the original title
    firstDoc      = CsvDoc t s py pm pd firstAbstract auth
    firstAbstract = head' abstracts
    -- each following chunk is titled by its own first sentence
    nextDocs  = map (\txt -> CsvDoc (head' $ sentences txt)
                                    s py pm pd
                                    (unsentences $ tail' $ sentences txt)
                                    auth
                    ) (tail' abstracts)
    abstracts = splitBy splt abst
    head' x   = maybe ""   identity (head x)
    tail' x   = maybe [""] identity (tailMay x)
splitBy :: SplitBy -> Text -> [Text]
splitBy Chars = map pack . chunkAlong 1000 1 . unpack
splitBy Sentences = map unsentences . chunkAlong 20 1 . sentences
splitBy Paragraph = map removeTag . filter isTagText . parseTags
where
removeTag :: IsString p => Tag p -> p
removeTag (TagText x) = x
removeTag (TagComment x) = x
removeTag _ = ""
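For intuition: splitBy Sentences slides a 20-sentence window one sentence at a time, and splitBy Chars does the same over 1000-character windows. Assuming chunkAlong n step (from Gargantext.Prelude) behaves as a sliding window:

-- chunkAlong 3 1 [1..5] == [[1,2,3],[2,3,4],[3,4,5]]  (assumed behaviour)
-- so each produced chunk overlaps the previous one by all but one unit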
---------------------------------------------------------------
---------------------------------------------------------------
type Mean = Double
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean ls
where
ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
---------------------------------------------------------------
data CsvDoc = CsvDoc
{ title :: !Text
, source :: !Text
, publication_year :: !Int
, publication_month :: !Int
, publication_day :: !Int
, abstract :: !Text
, authors :: !Text
{ c_title :: !Text
, c_source :: !Text
, c_publication_year :: !Int
, c_publication_month :: !Int
, c_publication_day :: !Int
, c_abstract :: !Text
, c_authors :: !Text
}
deriving (Show, Generic)
deriving (Show)
instance FromNamedRecord CsvDoc where
parseNamedRecord r = CsvDoc <$> r .: "title"
......@@ -63,20 +157,25 @@ instance ToNamedRecord CsvDoc where
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = (defaultDecodeOptions {decDelimiter = fromIntegral $ ord '\t'} )
csvDecodeOptions = (defaultDecodeOptions
{decDelimiter = fromIntegral $ ord '\t'}
)
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = (defaultEncodeOptions {encDelimiter = fromIntegral $ ord '\t'} )
csvEncodeOptions = ( defaultEncodeOptions
{encDelimiter = fromIntegral $ ord '\t'}
)
readCsv :: FilePath -> IO (Header, V.Vector CsvDoc)
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
csvData <- BL.readFile fp
case decodeByNameWith csvDecodeOptions csvData of
Left e -> panic (pack e)
Right csvDocs -> pure csvDocs
writeCsv :: FilePath -> (Header, V.Vector CsvDoc) -> IO ()
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp $
encodeByNameWith csvEncodeOptions h (V.toList vs)
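A minimal round-trip sketch, assuming a tab-separated file whose header matches the fields of the FromNamedRecord instance:

-- read a corpus and write it back unchanged; both directions use the
-- tab delimiter configured in csvDecodeOptions / csvEncodeOptions
copyCsv :: FilePath -> FilePath -> IO ()
copyCsv from to = do
  (h, docs) <- readCsv from
  writeCsv to (h, docs)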
{-|
Module : Gargantext.Text.Search
Description : Search engine to filter a corpus by query
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This search engine is first used to clean a CSV file according to a query.
Starting from this model, a specific Gargantext engine will be built
(using more metrics, scores and features).
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings, NamedFieldPuns #-}
module Gargantext.Text.Search where
import Data.SearchEngine
import Data.Ix
-- Useful for handling stopwords
-- import Data.Set (Set)
-- import qualified Data.Set as Set
import Data.Text (Text)
import Gargantext.Prelude
import Gargantext.Text.Ngrams
import Gargantext.Text.Ngrams.Stem as ST
import Gargantext.Text.Parsers.CSV
type DocId = Int
type DocSearchEngine = SearchEngine
Doc
DocId
DocField
NoFeatures
data DocField = TitleField
| AbstractField
deriving (Eq, Ord, Enum, Bounded, Ix, Show)
initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine =
initSearchEngine docSearchConfig defaultSearchRankParameters
docSearchConfig :: SearchConfig Doc DocId DocField NoFeatures
docSearchConfig =
SearchConfig {
documentKey = d_docId,
extractDocumentTerms = extractTokens,
transformQueryTerm = normaliseQueryToken,
documentFeatureValue = const noFeatures
}
where
extractTokens :: Doc -> DocField -> [Text]
extractTokens doc TitleField = monograms (d_title doc)
extractTokens doc AbstractField = monograms (d_abstract doc)
normaliseQueryToken :: Text -> DocField -> Text
normaliseQueryToken tok =
let tokStem = ST.stem ST.EN
in \field -> case field of
TitleField -> tokStem tok
AbstractField -> tokStem tok
defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures
defaultSearchRankParameters =
SearchRankParameters {
paramK1,
paramB,
paramFieldWeights,
paramFeatureWeights = noFeatures,
paramFeatureFunctions = noFeatures,
paramResultsetSoftLimit = 2000,
paramResultsetHardLimit = 4000,
paramAutosuggestPrefilterLimit = 500,
paramAutosuggestPostfilterLimit = 500
}
where
paramK1 :: Float
paramK1 = 1.5
paramB :: DocField -> Float
paramB TitleField = 0.9
paramB AbstractField = 0.5
paramFieldWeights :: DocField -> Float
paramFieldWeights TitleField = 20
paramFieldWeights AbstractField = 5
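An end-to-end sketch mirroring CleanCsvCorpus.main above (searchDocs is a hypothetical helper, not part of this commit):

-- index a corpus once, then return the ids of the documents matching a
-- query; query terms are stemmed and matched against title and abstract
-- monograms, with BM25 field weights favouring the title (20 vs 5)
searchDocs :: [Doc] -> [Term] -> [DocId]
searchDocs docs q = query (insertDocs docs initialDocSearchEngine) q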
......@@ -17,6 +17,7 @@ extra-deps:
- duckling-0.1.3.0
- extra-1.5.3
- full-text-search-0.2.1.4
- fullstop-0.1.4
- haskell-src-exts-1.18.2
- http-types-0.12.1
- protolude-0.2
......