Commit 0f0feaac authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Ngrams metrics, thanks to text-metrics to begin with.

parent e6a1adfe
......@@ -27,10 +27,12 @@ library
, conduit
, conduit-extra
, directory
, duckling
, extra
, filepath
, http-conduit
, lens
, logging-effect
, opaleye
, path
, parsec
......@@ -49,9 +51,12 @@ library
, split
, tagsoup
, text
, text-metrics
, time
, timezone-series
, time-locale-compat
, transformers
, unordered-containers
, uuid
, vector
, wai
......@@ -82,10 +87,12 @@ library
, Data.Gargantext.Ngrams.Parser
, Data.Gargantext.Ngrams.Lang.En
, Data.Gargantext.Ngrams.Lang.Fr
, Data.Gargantext.Ngrams.Metrics
, Data.Gargantext.Ngrams.TextMining
, Data.Gargantext.Ngrams.Occurrences
, Data.Gargantext.Parsers
, Data.Gargantext.Parsers.WOS
, Data.Gargantext.Parsers.Date
, Data.Gargantext.Prelude
, Data.Gargantext.Server
, Data.Gargantext.Types
......@@ -117,6 +124,7 @@ test-suite garg-test-ngrams
Ngrams.Lang.Fr
Ngrams.Lang.En
Ngrams.Lang.Occurrences
Ngrams.Metrics
build-depends: base
, extra
, text
......
module Data.Gargantext.Ngrams (
module Data.Gargantext.Ngrams.Count,
--module Data.Gargantext.Ngrams.Hetero,
module Data.Gargantext.Ngrams.CoreNLP,
module Data.Gargantext.Ngrams.Parser,
module Data.Gargantext.Ngrams.Occurrences,
module Data.Gargantext.Ngrams.TextMining
--module Data.Gargantext.Ngrams.Words
module Data.Gargantext.Ngrams ( module Data.Gargantext.Ngrams.Count
--, module Data.Gargantext.Ngrams.Hetero
, module Data.Gargantext.Ngrams.CoreNLP
, module Data.Gargantext.Ngrams.Parser
, module Data.Gargantext.Ngrams.Occurrences
, module Data.Gargantext.Ngrams.TextMining
, module Data.Gargantext.Ngrams.Metrics
--, module Data.Gargantext.Ngrams.Words
) where
import Data.Gargantext.Ngrams.Count
......@@ -17,3 +17,5 @@ import Data.Gargantext.Ngrams.Parser
import Data.Gargantext.Ngrams.Occurrences
import Data.Gargantext.Ngrams.TextMining
--import Data.Gargantext.Ngrams.Words
import Data.Gargantext.Ngrams.Metrics
module Data.Gargantext.Ngrams.Metrics (levenshtein
, levenshteinNorm
, damerauLevenshtein
, damerauLevenshteinNorm
, overlap
, jaccard
, hamming
) where
import Data.Text (Text)
import GHC.Real (Ratio)
import qualified Data.Text.Metrics as DTM
-- | This module provide metrics to compare Text
-- starting as an API rexporting main functions of the great lib
-- text-metrics of Mark Karpov
-- | Levenshtein Distance
-- In information theory, Linguistics and computer science,
-- the Levenshtein distance is a string metric for measuring
-- the difference between two sequences.
-- See: https://en.wikipedia.org/wiki/Levenshtein_distance
levenshtein :: Text -> Text -> Int
levenshtein = DTM.levenshtein
-- | Return normalized Levenshtein distance between two 'Text' values.
-- Result is a non-negative rational number (represented as @'Ratio'
-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
-- strings, while 1 means exact match.
levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm = DTM.levenshteinNorm
-- | Return Damerau-Levenshtein distance between two 'Text' values. The
-- function works like 'levenshtein', but the collection of allowed
-- operations also includes transposition of two /adjacent/ characters.
-- See also:
-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein = DTM.damerauLevenshtein
-- damerau-Levenshtein distance normalized
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
-- Treating inputs like sets
-- | Return overlap coefficient for two 'Text' values. Returned value
-- is in the range from 0 (no similarity) to 1 (exact match). Return 1
-- if both 'Text' values are empty.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
overlap :: Text -> Text -> Ratio Int
overlap = DTM.overlap
-- | Jaccard distance
-- measures dissimilarity between sample sets
jaccard :: Text -> Text -> Ratio Int
jaccard = DTM.jaccard
-- | Hamming Distance
-- In information theory, the Hamming distance between two strings of
-- equal length is the number of positions at which the corresponding
-- symbols are different. In other words, it measures the minimum number of
-- substitutions required to change one string into the other
-- See: https://en.wikipedia.org/wiki/Hamming_distance
hamming :: Text -> Text -> Maybe Int
hamming = DTM.hamming
......@@ -4,5 +4,7 @@ packages:
- .
- /home/alexandre/local/logiciels/haskell/servant/servant-multipart
extra-deps:
- utc-0.2.0.1
- aeson-1.0.2.1
- duckling-0.1.3.0
- protolude-0.2
resolver: lts-9.2
......@@ -2,10 +2,11 @@ import Data.Gargantext.Types.Main (Language(..))
--import qualified Ngrams.Lang.Fr as Fr
import qualified Ngrams.Lang as Lang
import qualified Ngrams.Lang.Occurrences as Occ
import qualified Ngrams.Metrics as Metrics
main :: IO ()
main = do
Occ.parsersTest
Lang.ngramsExtractionTest EN
Metrics.main
--Lang.ngramsExtractionTest FR
{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Ngrams.Metrics (main) where
import Data.Ratio
import Data.Text (Text)
import Data.Gargantext.Ngrams.Metrics
import Test.Hspec
import Test.QuickCheck
import qualified Data.Text as T
#if !MIN_VERSION_base(4,8,0)
import Control.Applicative
#endif
instance Arbitrary Text where
arbitrary = T.pack <$> arbitrary
main :: IO ()
main = hspec spec
spec :: Spec
spec = do
describe "levenshtein" $ do
testSwap levenshtein
context "with concrete examples" $ do
testPair levenshtein "kitten" "sitting" 3
testPair levenshtein "cake" "drake" 2
testPair levenshtein "saturday" "sunday" 3
testPair levenshtein "red" "wax" 3
#if __GLASGOW_HASKELL__ >= 710
testPair levenshtein "a😀c" "abc" 1
#endif
testPair levenshtein "lucky" "lucky" 0
testPair levenshtein "" "" 0
describe "levenshteinNorm" $ do
testSwap levenshteinNorm
testPair levenshteinNorm "kitten" "sitting" (4 % 7)
testPair levenshteinNorm "cake" "drake" (3 % 5)
testPair levenshteinNorm "saturday" "sunday" (5 % 8)
testPair levenshteinNorm "red" "wax" (0 % 1)
#if __GLASGOW_HASKELL__ >= 710
testPair levenshteinNorm "a😀c" "abc" (2 % 3)
#endif
testPair levenshteinNorm "lucky" "lucky" (1 % 1)
testPair levenshteinNorm "" "" (1 % 1)
describe "damerauLevenshtein" $ do
testSwap damerauLevenshtein
testPair damerauLevenshtein "veryvery long" "very long" 4
testPair damerauLevenshtein "thing" "think" 1
testPair damerauLevenshtein "nose" "ones" 2
testPair damerauLevenshtein "thing" "sign" 3
testPair damerauLevenshtein "red" "wax" 3
#if __GLASGOW_HASKELL__ >= 710
testPair damerauLevenshtein "a😀c" "abc" 1
#endif
testPair damerauLevenshtein "lucky" "lucky" 0
testPair damerauLevenshtein "" "" 0
describe "damerauLevenshteinNorm" $ do
testSwap damerauLevenshteinNorm
testPair damerauLevenshteinNorm "veryvery long" "very long" (9 % 13)
testPair damerauLevenshteinNorm "thing" "think" (4 % 5)
testPair damerauLevenshteinNorm "nose" "ones" (1 % 2)
testPair damerauLevenshteinNorm "thing" "sign" (2 % 5)
testPair damerauLevenshteinNorm "red" "wax" (0 % 1)
#if __GLASGOW_HASKELL__ >= 710
testPair damerauLevenshteinNorm "a😀c" "abc" (2 % 3)
#endif
testPair damerauLevenshteinNorm "lucky" "lucky" (1 % 1)
testPair damerauLevenshteinNorm "" "" (1 % 1)
describe "hamming" $ do
testSwap hamming
testPair hamming "karolin" "kathrin" (Just 3)
testPair hamming "karolin" "kerstin" (Just 3)
testPair hamming "1011101" "1001001" (Just 2)
testPair hamming "2173896" "2233796" (Just 3)
testPair hamming "toned" "roses" (Just 3)
testPair hamming "red" "wax" (Just 3)
#if __GLASGOW_HASKELL__ >= 710
testPair hamming "a😀c" "abc" (Just 1)
#endif
testPair hamming "lucky" "lucky" (Just 0)
testPair hamming "" "" (Just 0)
testPair hamming "small" "big" Nothing
describe "overlap" $ do
testSwap overlap
testPair overlap "fly" "butterfly" (1 % 1)
testPair overlap "night" "nacht" (3 % 5)
testPair overlap "context" "contact" (5 % 7)
testPair overlap "red" "wax" (0 % 1)
#if __GLASGOW_HASKELL__ >= 710
testPair overlap "a😀c" "abc" (2 % 3)
#endif
testPair overlap "lucky" "lucky" (1 % 1)
describe "jaccard" $ do
testSwap jaccard
testPair jaccard "xxx" "xyx" (1 % 2)
testPair jaccard "night" "nacht" (3 % 7)
testPair jaccard "context" "contact" (5 % 9)
#if __GLASGOW_HASKELL__ >= 710
testPair overlap "a😀c" "abc" (2 % 3)
#endif
testPair jaccard "lucky" "lucky" (1 % 1)
-- | Test that given function returns the same results when order of
-- arguments is swapped.
testSwap :: (Eq a, Show a) => (Text -> Text -> a) -> SpecWith ()
testSwap f = context "if we swap the arguments" $
it "produces the same result" $
property $ \a b -> f a b === f b a
-- | Create spec for given metric function applying it to two 'Text' values
-- and comparing the result with expected one.
testPair :: (Eq a, Show a)
=> (Text -> Text -> a) -- ^ Function to test
-> Text -- ^ First input
-> Text -- ^ Second input
-> a -- ^ Expected result
-> SpecWith ()
testPair f a b r = it ("‘" ++ T.unpack a ++ "’ and ‘" ++ T.unpack b ++ "’") $
f a b `shouldBe` r
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment