Commit 64f251bc authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Merge branch 'dev' into dev-test

parents 55f89c13 67c34520
...@@ -41,7 +41,7 @@ import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) ...@@ -41,7 +41,7 @@ import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv') import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich) import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Core.Text.Learn (detectLangDefault) -- import Gargantext.Core.Text.Learn (detectLangDefault)
import System.FilePath (FilePath(), takeExtension) import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC import qualified Data.ByteString.Char8 as DBC
...@@ -103,11 +103,11 @@ parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff ...@@ -103,11 +103,11 @@ parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS -- TODO use language for RIS
toDoc ff d = do toDoc ff d = do
let abstract = lookup "abstract" d -- let abstract = lookup "abstract" d
let lang = maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract)) let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
pure $ HyperdataDocument (Just $ DT.pack $ show ff) pure $ HyperdataDocument (Just $ DT.pack $ show ff)
......
...@@ -112,7 +112,7 @@ detectLangDefault = detectCat 99 eventLang ...@@ -112,7 +112,7 @@ detectLangDefault = detectCat 99 eventLang
textSample :: Lang -> String textSample :: Lang -> String
textSample EN = EN.textSample textSample EN = EN.textSample
textSample FR = FR.textSample textSample FR = FR.textSample
textSample _ = panic "textSample: not impl yet" textSample _ = panic "[G.C.T.L:detectLangDefault] not impl yet"
--textSample DE = DE.textSample --textSample DE = DE.textSample
--textSample SP = SP.textSample --textSample SP = SP.textSample
--textSample CH = CH.textSample --textSample CH = CH.textSample
......
...@@ -256,41 +256,50 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do ...@@ -256,41 +256,50 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
-- splitAt -- splitAt
let let
-- use % of list if to big, or Int if to small -- use % of list if to big, or Int if to small
listSizeLocal = 1000 :: Double mapSize = 1000 :: Double
canSize = mapSize * 10 :: Double
inclSize = 0.4 :: Double inclSize = 0.4 :: Double
exclSize = 1 - inclSize exclSize = 1 - inclSize
splitAt' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * listSizeLocal)) splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
monoInc_size n = splitAt' n $ monoSize * inclSize / 2
multExc_size n = splitAt' n $ multSize * exclSize / 2
(mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
(mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
monoInc_size = splitAt' $ monoSize * inclSize / 2 (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
(monoScoredInclHead, _monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
(monoScoredExclHead, _monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
multExc_size = splitAt' $ multSize * exclSize / 2
(multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
(multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
printDebug "stopWords" stopTerms (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
(canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
(canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
(canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
------------------------------------------------------------ ------------------------------------------------------------
-- Final Step building the Typed list -- Final Step building the Typed list
-- Candidates Terms need to be filtered -- Candidates Terms need to be filtered
let let
maps = setListType (Just MapTerm) maps = setListType (Just MapTerm)
$ monoScoredInclHead $ mapMonoScoredInclHead
<> monoScoredExclHead <> mapMonoScoredExclHead
<> multScoredInclHead <> mapMultScoredInclHead
<> multScoredExclHead <> mapMultScoredExclHead
-- An original way to filter to start with -- An original way to filter to start with
cands = setListType (Just CandidateTerm) cands = setListType (Just CandidateTerm)
$ {- monoScoredInclTail $ canMonoScoredIncHead
<> monoScoredExclTail <> canMonoScoredExclHead
<> -} multScoredInclTail <> canMulScoredInclHead
<> multScoredExclTail <> canMultScoredExclHead
-- TODO count it too
cands' = setListType (Just CandidateTerm) cands' = setListType (Just CandidateTerm)
{-$ groupedMonoTail {-$ groupedMonoTail
<>-} groupedMultTail <>-} groupedMultTail
...@@ -303,6 +312,4 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do ...@@ -303,6 +312,4 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
)] )]
] ]
-- printDebug "result" result
pure result pure result
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment