Commit 0225091d authored by qlobbe's avatar qlobbe

remove groups from time matching and work only on ngrams and ids

parent f5393047
...@@ -245,6 +245,9 @@ ngramsToCooc ngrams coocs = ...@@ -245,6 +245,9 @@ ngramsToCooc ngrams coocs =
getGroupId :: PhyloGroup -> PhyloGroupId getGroupId :: PhyloGroup -> PhyloGroupId
getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex) getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
idToPrd :: PhyloGroupId -> PhyloPeriodId
idToPrd id = (fst . fst) id
getGroupThr :: PhyloGroup -> Double getGroupThr :: PhyloGroup -> Double
getGroupThr group = head' "getGroupThr" ((group ^. phylo_groupMeta) ! "thr") getGroupThr group = head' "getGroupThr" ((group ^. phylo_groupMeta) ! "thr")
...@@ -286,8 +289,8 @@ getProximityStep proximity = ...@@ -286,8 +289,8 @@ getProximityStep proximity =
-- | Phylo | -- -- | Phylo | --
--------------- ---------------
addPointers :: PhyloGroup -> Filiation -> PointerType -> [Pointer] -> PhyloGroup addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
addPointers group fil pty pointers = addPointers fil pty pointers group =
case pty of case pty of
TemporalPointer -> case fil of TemporalPointer -> case fil of
ToChilds -> group & phylo_groupPeriodChilds .~ pointers ToChilds -> group & phylo_groupPeriodChilds .~ pointers
......
...@@ -74,14 +74,14 @@ weightedLogJaccard' sens nbDocs diago ngrams ngrams' ...@@ -74,14 +74,14 @@ weightedLogJaccard' sens nbDocs diago ngrams ngrams'
-- | To process the proximity between a current group and a pair of targets group -- | To process the proximity between a current group and a pair of targets group
toProximity :: Double -> Map Int Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
toProximity nbDocs diago proximity ego target target' = toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
case proximity of case proximity of
WeightedLogJaccard sens _ _ -> WeightedLogJaccard sens _ _ ->
let targetsNgrams = if target == target' let pairNgrams = if targetNgrams == targetNgrams'
then (target ^. phylo_groupNgrams) then targetNgrams
else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams) else union targetNgrams targetNgrams'
in weightedLogJaccard' sens nbDocs diago (ego ^. phylo_groupNgrams) targetsNgrams in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
Hamming -> undefined Hamming -> undefined
...@@ -96,36 +96,38 @@ findLastPeriod fil periods = case fil of ...@@ -96,36 +96,38 @@ findLastPeriod fil periods = case fil of
-- | To filter pairs of candidates related to old pointers periods -- | To filter pairs of candidates related to old pointers periods
removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId -> [(PhyloGroup,PhyloGroup)] -> [(PhyloGroup,PhyloGroup)] removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
-> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
-> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
removeOldPointers oldPointers fil thr prox prd pairs removeOldPointers oldPointers fil thr prox prd pairs
| null oldPointers = pairs | null oldPointers = pairs
| null (filterPointers prox thr oldPointers) = | null (filterPointers prox thr oldPointers) =
let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers) let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
in if lastMatchedPrd == prd in if lastMatchedPrd == prd
then [] then []
else filter (\(g,g') -> else filter (\((id,_),(id',_)) ->
case fil of case fil of
ToParents -> ((fst $ g ^. phylo_groupPeriod) < (fst lastMatchedPrd)) ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
|| ((fst $ g' ^. phylo_groupPeriod) < (fst lastMatchedPrd)) || (((fst . fst . fst) id') < (fst lastMatchedPrd))
ToChilds -> ((fst $ g ^. phylo_groupPeriod) > (fst lastMatchedPrd)) ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
|| ((fst $ g' ^. phylo_groupPeriod) > (fst lastMatchedPrd))) pairs || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
| otherwise = [] | otherwise = []
-- | Find pairs of valuable candidates to be matched makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
makePairs' :: PhyloGroup -> [PhyloGroup] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity -> Map Date Double -> Map Date Cooc -> [(PhyloGroup,PhyloGroup)] -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
makePairs' ego candidates periods oldPointers fil thr prox docs diagos = makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
case null periods of if (null periods)
True -> [] then []
False -> removeOldPointers oldPointers fil thr prox lastPrd else removeOldPointers oldPointers fil thr prox lastPrd
-- | at least on of the pair candidates should be from the last added period -- | at least on of the pair candidates should be from the last added period
$ filter (\(g,g') -> ((g ^. phylo_groupPeriod) == lastPrd) $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
|| ((g' ^. phylo_groupPeriod) == lastPrd))
$ listToKeys $ listToKeys
$ filter (\g -> let nbDocs = sum $ elems $ (filterDocs docs ([ego ^. phylo_groupPeriod, g ^. phylo_groupPeriod])) $ filter (\(id,ngrams) ->
diago = reduceDiagos $ filterDiago diagos ([ego ^. phylo_groupPeriod, g ^. phylo_groupPeriod]) let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
in (g ^. phylo_groupPeriod == lastPrd) diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
|| ((toProximity nbDocs diago prox ego ego g) >= thr)) candidates in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
) candidates
where where
lastPrd :: PhyloPeriodId lastPrd :: PhyloPeriodId
lastPrd = findLastPeriod fil periods lastPrd = findLastPeriod fil periods
...@@ -140,39 +142,39 @@ reduceDiagos diagos = mapKeys (\(k,_) -> k) ...@@ -140,39 +142,39 @@ reduceDiagos diagos = mapKeys (\(k,_) -> k)
$ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos) $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc -> Double -> PhyloGroup -> PhyloGroup phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
phyloGroupMatching candidates fil proxi docs diagos thr ego = -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
if (null $ filterPointers proxi thr $ getPeriodPointers fil ego) phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
if (null $ filterPointers proxi thr oldPointers)
-- | let's find new pointers -- | let's find new pointers
then if null nextPointers then if null nextPointers
then addPointers ego fil TemporalPointer [] then []
else addPointers ego fil TemporalPointer else head' "phyloGroupMatching"
$ head' "phyloGroupMatching"
-- | Keep only the best set of pointers grouped by proximity -- | Keep only the best set of pointers grouped by proximity
$ groupBy (\pt pt' -> snd pt == snd pt') $ groupBy (\pt pt' -> snd pt == snd pt')
$ reverse $ sortOn snd $ head' "pointers" nextPointers $ reverse $ sortOn snd $ head' "pointers" nextPointers
-- | Find the first time frame where at leats one pointer satisfies the proximity threshold -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
else ego else oldPointers
where where
nextPointers :: [[Pointer]] nextPointers :: [[Pointer]]
nextPointers = take 1 nextPointers = take 1
$ dropWhile (null) $ dropWhile (null)
-- | for each time frame, process the proximity on relevant pairs of targeted groups -- | for each time frame, process the proximity on relevant pairs of targeted groups
$ scanl (\acc groups -> $ scanl (\acc groups ->
let periods = nub $ map _phylo_groupPeriod $ concat groups let periods = nub $ map (fst . fst . fst) $ concat groups
nbdocs = sum $ elems $ (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods)) nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
diago = reduceDiagos diago = reduceDiagos
$ filterDiago diagos ([ego ^. phylo_groupPeriod] ++ periods) $ filterDiago diagos ([(fst . fst) id] ++ periods)
-- | important resize nbdocs et diago dans le make pairs -- | important resize nbdocs et diago dans le make pairs
pairs = makePairs' ego (concat groups) periods (getPeriodPointers fil ego) fil thr proxi docs diagos pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
in acc ++ ( filterPointers proxi thr in acc ++ ( filterPointers proxi thr
$ concat $ concat
$ map (\(c,c') -> $ map (\(c,c') ->
-- | process the proximity between the current group and a pair of candidates -- | process the proximity between the current group and a pair of candidates
let proximity = toProximity nbdocs diago proxi ego c c' let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
in if (c == c') in if (c == c')
then [(getGroupId c,proximity)] then [(fst c,proximity)]
else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs )) [] else [(fst c,proximity),(fst c',proximity)] ) pairs )) []
$ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...] $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
...@@ -195,10 +197,10 @@ getNextPeriods fil max' pId pIds = ...@@ -195,10 +197,10 @@ getNextPeriods fil max' pId pIds =
ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
getCandidates :: PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]] getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
getCandidates ego targets = getCandidates ego targets =
map (\groups' -> map (\groups' ->
filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams) filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
) groups') targets ) groups') targets
...@@ -210,8 +212,8 @@ matchGroupsToGroups frame periods proximity thr docs coocs groups = ...@@ -210,8 +212,8 @@ matchGroupsToGroups frame periods proximity thr docs coocs groups =
periodsPar = getNextPeriods ToParents frame prd periods periodsPar = getNextPeriods ToParents frame prd periods
periodsChi = getNextPeriods ToChilds frame prd periods periodsChi = getNextPeriods ToChilds frame prd periods
-- | 2) find the parents/childs matching candidates -- | 2) find the parents/childs matching candidates
candidatesPar = map (\prd' -> findWithDefault [] prd' groups') periodsPar candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
candidatesChi = map (\prd' -> findWithDefault [] prd' groups') periodsChi candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
-- | 3) find the parents/child number of docs by years -- | 3) find the parents/child number of docs by years
docsPar = filterDocs docs ([prd] ++ periodsPar) docsPar = filterDocs docs ([prd] ++ periodsPar)
docsChi = filterDocs docs ([prd] ++ periodsChi) docsChi = filterDocs docs ([prd] ++ periodsChi)
...@@ -219,8 +221,13 @@ matchGroupsToGroups frame periods proximity thr docs coocs groups = ...@@ -219,8 +221,13 @@ matchGroupsToGroups frame periods proximity thr docs coocs groups =
diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar) diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar) diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
-- | 5) match in parallel all the groups (egos) to their possible candidates -- | 5) match in parallel all the groups (egos) to their possible candidates
egos = map (\ego -> phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar thr egos = map (\ego ->
$ phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi thr ego) let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
in addPointers ToChilds TemporalPointer pointersChi
$ addPointers ToParents TemporalPointer pointersPar ego)
$ findWithDefault [] prd groups' $ findWithDefault [] prd groups'
egos' = egos `using` parList rdeepseq egos' = egos `using` parList rdeepseq
in acc ++ egos' in acc ++ egos'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment