Commit 64f251bc authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Merge branch 'dev' into dev-test

parents 55f89c13 67c34520
......@@ -41,7 +41,7 @@ import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Core.Text.Learn (detectLangDefault)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC
......@@ -103,11 +103,11 @@ parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
let abstract = lookup "abstract" d
let lang = maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
-- let abstract = lookup "abstract" d
let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
pure $ HyperdataDocument (Just $ DT.pack $ show ff)
......
......@@ -112,7 +112,7 @@ detectLangDefault = detectCat 99 eventLang
textSample :: Lang -> String
textSample EN = EN.textSample
textSample FR = FR.textSample
textSample _ = panic "textSample: not impl yet"
textSample _ = panic "[G.C.T.L:detectLangDefault] not impl yet"
--textSample DE = DE.textSample
--textSample SP = SP.textSample
--textSample CH = CH.textSample
......
......@@ -256,41 +256,50 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
-- splitAt
let
-- use % of list if to big, or Int if to small
listSizeLocal = 1000 :: Double
mapSize = 1000 :: Double
canSize = mapSize * 10 :: Double
inclSize = 0.4 :: Double
exclSize = 1 - inclSize
splitAt' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
monoInc_size n = splitAt' n $ monoSize * inclSize / 2
multExc_size n = splitAt' n $ multSize * exclSize / 2
(mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
(mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
monoInc_size = splitAt' $ monoSize * inclSize / 2
(monoScoredInclHead, _monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
(monoScoredExclHead, _monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
(mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
(mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
multExc_size = splitAt' $ multSize * exclSize / 2
(multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
(multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
printDebug "stopWords" stopTerms
(canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
(canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
(canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
(canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
------------------------------------------------------------
-- Final Step building the Typed list
-- Candidates Terms need to be filtered
let
maps = setListType (Just MapTerm)
$ monoScoredInclHead
<> monoScoredExclHead
<> multScoredInclHead
<> multScoredExclHead
$ mapMonoScoredInclHead
<> mapMonoScoredExclHead
<> mapMultScoredInclHead
<> mapMultScoredExclHead
-- An original way to filter to start with
cands = setListType (Just CandidateTerm)
$ {- monoScoredInclTail
<> monoScoredExclTail
<> -} multScoredInclTail
<> multScoredExclTail
cands = setListType (Just CandidateTerm)
$ canMonoScoredIncHead
<> canMonoScoredExclHead
<> canMulScoredInclHead
<> canMultScoredExclHead
-- TODO count it too
cands' = setListType (Just CandidateTerm)
{-$ groupedMonoTail
<>-} groupedMultTail
......@@ -303,6 +312,4 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
)]
]
-- printDebug "result" result
pure result
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment