fixes for #48

tcpd · Feb 14, 2018 · 399ccda · 399ccda
1 parent 93fa67b
commit 399ccda
Show file tree

Hide file tree

Showing 3 changed files with 92 additions and 30 deletions.
diff --git a/WebContent/table.jsp b/WebContent/table.jsp
@@ -375,6 +375,7 @@ import="java.util.*"
             $group.removeClass('reviewed')
             $('.reviewed-button', $group).text ('Mark as reviewed');
         }
+        window.last_name = ($($('td', $group)[2]).text());
     }
 
     function group_reviewed_handler (e) {
@@ -539,7 +540,7 @@ import="java.util.*"
                     $spinner.fadeOut();
                     if (o && o.status == 0) {
                         // could perhaps display a toast here
-                        window.location.reload();
+                        window.location = 'table?page=' + getParameterByName('page', window.location.href) + '&scrollTo=' + escape(window.last_name);
                     } else {
                         alert('Save failed!');
                     }
@@ -574,11 +575,11 @@ import="java.util.*"
                 if (o && o.status == 0) {
                     // could perhaps display a toast here
                 } else {
-                    alert('Filter failed!');
+                    alert('Merge failed!');
                 }
-                window.location = 'table?page=1';
+                window.location = 'table?page=1&scrollTo=' + escape(window.last_name);
             },
-            error: function (jqXHR, textStatus, errorThrown) { $spinner.fadeOut(); alert ('Warning: filter failed! ' + textStatus + ' ' + jqXHR.responseText);}
+            error: function (jqXHR, textStatus, errorThrown) { $spinner.fadeOut(); alert ('Warning: Merge failed! ' + textStatus + ' ' + jqXHR.responseText);}
         });
     }
 
@@ -600,6 +601,15 @@ import="java.util.*"
         );
     }
 
+    /**
+     * Returns the value of the query-string parameter `name` in `url`.
+     *
+     * @param name parameter name to look up
+     * @param url  URL to search; when falsy, window.location.href is used
+     * @return null if the parameter is absent, '' if it is present without a value,
+     *         otherwise the decoded value ('+' is treated as a space)
+     */
+    function getParameterByName(name, url) {
+        if (!url) url = window.location.href;
+        // backslash-escape [ and ] so that they are matched literally inside the RegExp below
+        name = name.replace(/[\[\]]/g, "\\$&");
+        var regex = new RegExp("[?&]" + name + "(=([^&#]*)|&|#|$)"),
+            results = regex.exec(url);
+        if (!results) return null;
+        if (!results[2]) return '';
+        var raw = results[2].replace(/\+/g, " ");
+        try {
+            return decodeURIComponent(raw);
+        } catch (err) {
+            // values encoded with escape() can contain %uXXXX or Latin-1 %XX sequences,
+            // which decodeURIComponent rejects with a URIError; unescape() is the matching decoder
+            return unescape(raw);
+        }
+    }
 
     $('.select-button').click (select_all_handler);
     $('.select-till-here-button').click (select_till_here_handler);
@@ -614,10 +624,32 @@ import="java.util.*"
     $('.filter-button').click (function() { $('#filterModal').modal();});
     $('.filter-submit-button').click (filter_submit_handler);
     $('.help-button').click (function() { $('#helpModal').modal()});
-    $('input.select-checkbox').click (function(e) {
-        var $target = $(event.target);
-        alert($target.next().text());
-    });
+
+    // try to scroll to area that was last clicked on the merge page
+    {
+        // window.last_name tracks the text of the cell next to the most recently clicked checkbox
+        window.last_name = '';
+        $('input.select-checkbox').click(function (e) {
+            // read the target from the handler's own argument rather than the implicit global `event`,
+            // which is not defined in every browser
+            var $target = $(e.target);
+            window.last_name = $target.closest('td').next().text();
+        });
+
+        // the scrollTo query parameter carries the cell text to scroll back to after a reload
+        var scrollToText = getParameterByName('scrollTo', window.location.href);
+
+        if (scrollToText) {
+            scrollToText = scrollToText.toLowerCase();
+            // scroll to the first table cell whose text contains the requested text (case-insensitive)
+            $('td').each(function (i, elem) {
+                var text = $(elem).text().toLowerCase();
+                if (text.indexOf(scrollToText) >= 0) {
+                    elem.scrollIntoView();
+                    return false; // stop iterating after the first match
+                }
+                return true;
+            });
+        }
+    }
+
 
 </script>
 </body>

diff --git a/src/in/edu/ashoka/surf/Config.java b/src/in/edu/ashoka/surf/Config.java
@@ -29,30 +29,13 @@ public class Config {
     public static final int DEFAULT_EDIT_DISTANCE = 2;
     public static final int DEFAULT_MIN_TOKEN_OVERLAP = 2;
     public static final int DEFAULT_IGNORE_TOKEN_FREQUENCY = 200;
+    public static final int DEFAULT_MIN_SPLITWEIGHT = 10; // a token in a field will be split only if its constituent parts have appeared independently > 10 times. (However, there is an additional factor of 2x needed if the fields are only of length 3)
 
     /** SEE ALSO: we could refer to Metaphone 3 https://en.wikipedia.org/wiki/Metaphone#Metaphone_3 */
     static String[] replacements = new String[]{
             "[^A-Za-z\\s]", "",
-            "NAYAK", "NAIK",
-            "IYA", "IA", // # RAJORIA vs RAJORIYA
-            "AGRA", "AGAR", // AGRAWAL vs AGARWAL
-            "KER", "KAR", // SONKAR vs SONKER
-            "HAR", "HR", // e.g. VOHARA vs VOHRA
-         //   "HAT", "HT", // e.g. MAHATAB vs MAHTAB, but this breaks BHAT and makes it BT
-            "RAT", "RT", // e.g. BHARATENDRA vs BHARTENDRA
-            "RAJ", "RJ", // e.g. NEERJA vs NEERAJA
-            "SAL", "SL", // e.g. BHONSALE vs BHONSLE
-            "NAG", "NG", // e.g. WANAGE vs WANGE. Why, even HANAGAL vs HANGAL
 
-            // suffix removal, do this before phonetic conversions
-            "BAI$", "",
-            "BHAI$", "",
-            "BEN$", "",
-            "JI$", "",
-            "LAL$", "",
-            "KUMAR$", "",
-
-            // phonetic corrections
+            // remove aspirations. these should happen before things like RAT=>RT, etc. e.g DASHARATHA => DASARATA -> DASARAT
             "TH", "T",
             "V", "W",
             "GH", "G",
@@ -64,8 +47,35 @@ public class Config {
             "PH", "F",
             "SH", "S",
             "JH", "Z", // JHAVERI vs ZAVERI
+
+            // safe replacements
             "Z", "S",
             "Y", "I",
+
+            "NAYAK", "NAIK",
+            "IYA", "IA", // # RAJORIA vs RAJORIYA
+            "AGRA", "AGAR", // AGRAWAL vs AGARWAL
+            "KER", "KAR", // SONKAR vs SONKER
+            "HAR", "HR", // e.g. VOHARA vs VOHRA
+         //   "HAT", "HT", // e.g. MAHATAB vs MAHTAB, but this breaks BHAT and makes it BT
+            "RAT", "RT", // e.g. BHARATENDRA vs BHARTENDRA
+            "RAJ", "RJ", // e.g. NEERJA vs NEERAJA
+            "SAL", "SL", // e.g. BHONSALE vs BHONSLE
+            "(.)NAG(.)", "$1NG$2", // e.g. WANAGE vs WANGE. Why, even HANAGAL vs HANGAL. but only if it's in the middle of a word. We don't want to convert NAG to NG or ANANTNAG to ANANTNG NAGALAND to NGALAND
+
+            // could we convert the above to a general rule like consonant-vowel-consonant in the middle of a word can be converted to consonant-consonant, esp. if the vowel is A, E, U
+
+            // suffix removal, do this before phonetic conversions.
+            // but only if it has a minimum length. we don't want to replace SUKUMAR with SU or DULAL with DU.
+            "(...)BAI$", "$1",
+            "(...)BHAI$", "$1",
+            "(...)BEN$", "$1",
+            "(...)JI$", "$1",
+            "(...)LAL$", "$1",
+            "(...)KUMAR$", "$1",
+
+            // phonetic corrections
+
             "AU", "OU",
             "OO", "U",
             "EE", "I",

diff --git a/src/in/edu/ashoka/surf/Tokenizer.java b/src/in/edu/ashoka/surf/Tokenizer.java
@@ -8,6 +8,7 @@
 import com.google.common.collect.Multimap;
 import com.google.common.collect.Multiset;
 
+import com.google.common.collect.Multisets;
 import edu.stanford.muse.util.Util;
 import in.edu.ashoka.surf.util.Timers;
 import org.apache.commons.logging.Log;
@@ -46,6 +47,11 @@ private static Multiset<String> generateTokens(Multimap<String, Row> map) {
             }
         }
         log.info ("single word keys: " + nSingleWordKeys + " total tokens: " + tokens.size() + " unique: " + tokens.elementSet().size());
+
+        int i = 0;
+        for (String s: Multisets.copyHighestCountFirst(tokens).elementSet())
+            log.info (++i + " " + s + " " + tokens.count(s));
+
         return tokens;
     }
 
@@ -62,13 +68,23 @@ private static List<String> retokenize (String s, Multiset<String> validTokens)
     private static List<String> splitToken (String s, Multiset<String> validTokens) {
         List<String> result = new ArrayList<>();
 
-        int bestSplit = 4; // min. splitweight
+        int bestSplit = Config.DEFAULT_MIN_SPLITWEIGHT; // min. splitweight
         String bestFirst = "", bestSecond = "";
         for (int i = 2; i <= s.length()-2; i++) {
             String first = s.substring(0, i);
             String second = s.substring(i);
+
+
             // first and second must be both at least 2 chars long
             int splitWeight = validTokens.count(first) + validTokens.count(second);
+
+            // we penalize splits that result in tokens of length just 2
+            // this is to prevent DUTA breaking up into DU TA
+            if (i == 2)
+                splitWeight /= 2;
+            if (i == s.length()-3)
+                splitWeight /= 2;
+
             if (splitWeight > bestSplit)
             {
                 bestSplit = splitWeight;
@@ -89,7 +105,12 @@ private static List<String> splitToken (String s, Multiset<String> validTokens)
     }
 
     /** sets up canonicalized, retokenized and sorted-retokenized versions of the given field.
-     * e.g if field is Name, fields called _c_Name, _t_Name and _st_Name are added to all rows */
+     * e.g if field is Name, fields called _c_Name, _t_Name and _st_Name are added to all rows.
+     * We have to be careful about tokenization. It can have a disproportionate effect.
+     * BIREN DUTA => BIREN DUTA(canonicalization) => BI REN DU TA (retokenized) => BI REN DU TA (sorted) => BIDURENTA
+     * BIREN DATA => BIREN DATA => BIREN DATA => BIDATAREN
+     * After tokenization and sorting, these two strings end up very far apart, even though the originals differ by a single letter.
+     * */
     static void setupDesiVersions(Collection<Row> allRows, String field)
     {
         String cfield = "_c_" + field;
@@ -161,7 +182,6 @@ public static String canonicalizeDesi(String s)
 
         StringBuilder result = new StringBuilder();
 
-        // these are from Gilles
         List<String> tokens = Util.tokenize(s, " \t."); // very important to always tokenize on periods. M.F. SOLANKI should become M F SOLANKI, not MF SOLANKI
         for (int i=0; i<tokens.size(); i++) {
             String token = tokens.get(i);