Skip to content

Commit

Permalink
fixes for #48
Browse files Browse the repository at this point in the history
  • Loading branch information
hangal committed Feb 14, 2018
1 parent 93fa67b commit 399ccda
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 30 deletions.
48 changes: 40 additions & 8 deletions WebContent/table.jsp
Expand Up @@ -375,6 +375,7 @@ import="java.util.*"
$group.removeClass('reviewed')
$('.reviewed-button', $group).text ('Mark as reviewed');
}
window.last_name = ($($('td', $group)[2]).text());
}
function group_reviewed_handler (e) {
Expand Down Expand Up @@ -539,7 +540,7 @@ import="java.util.*"
$spinner.fadeOut();
if (o && o.status == 0) {
// could perhaps display a toast here
window.location.reload();
window.location = 'table?page=' + getParameterByName('page', window.location.href) + '&scrollTo=' + escape(window.last_name);
} else {
alert('Save failed!');
}
Expand Down Expand Up @@ -574,11 +575,11 @@ import="java.util.*"
if (o && o.status == 0) {
// could perhaps display a toast here
} else {
alert('Filter failed!');
alert('Merge failed!');
}
window.location = 'table?page=1';
window.location = 'table?page=1&scrollTo=' + escape(window.last_name);
},
error: function (jqXHR, textStatus, errorThrown) { $spinner.fadeOut(); alert ('Warning: filter failed! ' + textStatus + ' ' + jqXHR.responseText);}
error: function (jqXHR, textStatus, errorThrown) { $spinner.fadeOut(); alert ('Warning: Merge failed! ' + textStatus + ' ' + jqXHR.responseText);}
});
}
Expand All @@ -600,6 +601,15 @@ import="java.util.*"
);
}
/**
 * Returns the decoded value of the query-string parameter `name` in `url`.
 *
 * @param {string} name  parameter name to look up (matched literally)
 * @param {string} [url] URL to search; defaults to window.location.href
 * @returns {?string} null if the parameter is absent, '' if it is present
 *                    without a value, otherwise the decoded value
 */
function getParameterByName(name, url) {
    if (!url) url = window.location.href;
    // Escape every regex metacharacter so the name is matched literally
    // (e.g. "ids[]" or "a.b"), not interpreted as a pattern.
    name = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    var regex = new RegExp("[?&]" + name + "(=([^&#]*)|&|#|$)"),
        results = regex.exec(url);
    if (!results) return null;
    if (!results[2]) return '';
    // '+' is the form-encoding for a space.
    var raw = results[2].replace(/\+/g, " ");
    try {
        return decodeURIComponent(raw);
    } catch (err) {
        // This page builds its URLs with escape(), which emits %uXXXX and
        // Latin-1 %XX sequences that decodeURIComponent rejects with a
        // URIError. unescape() is the matching decoder for that encoding.
        return unescape(raw);
    }
}
$('.select-button').click (select_all_handler);
$('.select-till-here-button').click (select_till_here_handler);
Expand All @@ -614,10 +624,32 @@ import="java.util.*"
$('.filter-button').click (function() { $('#filterModal').modal();});
$('.filter-submit-button').click (filter_submit_handler);
$('.help-button').click (function() { $('#helpModal').modal()});
$('input.select-checkbox').click (function(e) {
var $target = $(event.target);
alert($target.next().text());
});
// Try to scroll to the area that was last clicked on the merge page.
// The name next to the last-clicked checkbox is remembered in window.last_name;
// after a reload, the 'scrollTo' URL parameter is read and the first table cell
// containing that text (case-insensitive) is scrolled into view.
{
    window.last_name = ''; // this will track the name of the cell val next to the checkbox, in the last checkbox clicked
    $('input.select-checkbox').click(function (e) {
        // Use the handler's own event argument; the implicit global `event`
        // is not available in every browser.
        var $target = $(e.target);
        window.last_name = $target.closest('td').next().text();
    });

    var scrollToText = getParameterByName('scrollTo', window.location.href);
    if (scrollToText) {
        scrollToText = scrollToText.toLowerCase();
        $('td').each(function (i, elem) {
            var text = $(elem).text().toLowerCase();
            if (text.indexOf(scrollToText) >= 0) {
                elem.scrollIntoView();
                // NOTE(review): this alert looks like a debugging aid - confirm whether it should ship.
                alert ('scrolling to ' + scrollToText);
                return false; // stop at the first matching cell
            }
            return true;
        });
    }
}
</script>
</body>
Expand Down
48 changes: 29 additions & 19 deletions src/in/edu/ashoka/surf/Config.java
Expand Up @@ -29,30 +29,13 @@ public class Config {
public static final int DEFAULT_EDIT_DISTANCE = 2;
public static final int DEFAULT_MIN_TOKEN_OVERLAP = 2;
public static final int DEFAULT_IGNORE_TOKEN_FREQUENCY = 200;
public static final int DEFAULT_MIN_SPLITWEIGHT = 10; // a token in a field will be split only if it's constituent parts have appeared independently > 10 times. (However, there is an additional factor of 2x needed if the fields are only of length 3)

/** SEE ALSO: we could refer to Metaphone 3 https://en.wikipedia.org/wiki/Metaphone#Metaphone_3 */
static String[] replacements = new String[]{
"[^A-Za-z\\s]", "",
"NAYAK", "NAIK",
"IYA", "IA", // # RAJORIA vs RAJORIYA
"AGRA", "AGAR", // AGRAWAL vs AGARWAL
"KER", "KAR", // SONKAR vs SONKER
"HAR", "HR", // e.g. VOHARA vs VOHRA
// "HAT", "HT", // e.g. MAHATAB vs MAHTAB, but this breaks BHAT and makes it BT
"RAT", "RT", // e.g. BHARATENDRA vs BHARTENDRA
"RAJ", "RJ", // e.g. NEERJA vs NEERAJA
"SAL", "SL", // e.g. BHONSALE vs BHONSLE
"NAG", "NG", // e.g. WANAGE vs WANGE. Why, even HANAGAL vs HANGAL

// suffix removal, do this before phonetic conversions
"BAI$", "",
"BHAI$", "",
"BEN$", "",
"JI$", "",
"LAL$", "",
"KUMAR$", "",

// phonetic corrections
// remove aspirations. these should happen before things like RAT=>RT, etc. e.g DASHARATHA => DASARATA -> DASARAT
"TH", "T",
"V", "W",
"GH", "G",
Expand All @@ -64,8 +47,35 @@ public class Config {
"PH", "F",
"SH", "S",
"JH", "Z", // JHAVERI vs ZAVERI

// safe replacements
"Z", "S",
"Y", "I",

"NAYAK", "NAIK",
"IYA", "IA", // # RAJORIA vs RAJORIYA
"AGRA", "AGAR", // AGRAWAL vs AGARWAL
"KER", "KAR", // SONKAR vs SONKER
"HAR", "HR", // e.g. VOHARA vs VOHRA
// "HAT", "HT", // e.g. MAHATAB vs MAHTAB, but this breaks BHAT and makes it BT
"RAT", "RT", // e.g. BHARATENDRA vs BHARTENDRA
"RAJ", "RJ", // e.g. NEERJA vs NEERAJA
"SAL", "SL", // e.g. BHONSALE vs BHONSLE
"(.)NAG(.)", "$1NG$2", // e.g. WANAGE vs WANGE. Why, even HANAGAL vs HANGAL. but only if it's in the middle of a word. We don't want to convert NAG to NG or ANANTNAG to ANANTNG NAGALAND to NGALAND

// could we convert the above to a general rule like consonant-vowel-consonant in the middle of a word can be converted to consonant-consonant, esp. if the vowel is A, E, U

// suffix removal, do this before phonetic conversions.
// but only if it has a minimum length. we don't want to replace SUKUMAR with SU or DULAL with DU.
"(...)BAI$", "$1",
"(...)BHAI$", "$1",
"(...)BEN$", "$1",
"(...)JI$", "$1",
"(...)LAL$", "$1",
"(...)KUMAR$", "$1",

// phonetic corrections

"AU", "OU",
"OO", "U",
"EE", "I",
Expand Down
26 changes: 23 additions & 3 deletions src/in/edu/ashoka/surf/Tokenizer.java
Expand Up @@ -8,6 +8,7 @@
import com.google.common.collect.Multimap;
import com.google.common.collect.Multiset;

import com.google.common.collect.Multisets;
import edu.stanford.muse.util.Util;
import in.edu.ashoka.surf.util.Timers;
import org.apache.commons.logging.Log;
Expand Down Expand Up @@ -46,6 +47,11 @@ private static Multiset<String> generateTokens(Multimap<String, Row> map) {
}
}
log.info ("single word keys: " + nSingleWordKeys + " total tokens: " + tokens.size() + " unique: " + tokens.elementSet().size());

int i = 0;
for (String s: Multisets.copyHighestCountFirst(tokens).elementSet())
log.info (++i + " " + s + " " + tokens.count(s));

return tokens;
}

Expand All @@ -62,13 +68,23 @@ private static List<String> retokenize (String s, Multiset<String> validTokens)
private static List<String> splitToken (String s, Multiset<String> validTokens) {
List<String> result = new ArrayList<>();

int bestSplit = 4; // min. splitweight
int bestSplit = Config.DEFAULT_MIN_SPLITWEIGHT; // min. splitweight
String bestFirst = "", bestSecond = "";
for (int i = 2; i <= s.length()-2; i++) {
String first = s.substring(0, i);
String second = s.substring(i);


// first and second must be both at least 2 chars long
int splitWeight = validTokens.count(first) + validTokens.count(second);

// we penalize splits that result in tokens of length just 2
// this is to prevent DUTA breaking up into DU TA
if (i == 2)
splitWeight /= 2;
if (i == s.length()-3)
splitWeight /= 2;

if (splitWeight > bestSplit)
{
bestSplit = splitWeight;
Expand All @@ -89,7 +105,12 @@ private static List<String> splitToken (String s, Multiset<String> validTokens)
}

/** sets up canonicalized, retokenized and sorted-retokenized versions of the given field.
* e.g if field is Name, fields called _c_Name, _t_Name and _st_Name are added to all rows */
* e.g if field is Name, fields called _c_Name, _t_Name and _st_Name are added to all rows.
* We have to be careful about tokenization. It can have a disproportionate effect.
* BIREN DUTA => BIREN DUTA(canonicalization) => BI REN DU TA (retokenized) => BI REN DU TA (sorted) => BIDURENTA
* BIREN DATA => BIREN DATA => BIREN DATA => BIDATAREN
* The two resulting strings end up very far apart after tokenization and sorting, even though the original names differ by a single letter.
* */
static void setupDesiVersions(Collection<Row> allRows, String field)
{
String cfield = "_c_" + field;
Expand Down Expand Up @@ -161,7 +182,6 @@ public static String canonicalizeDesi(String s)

StringBuilder result = new StringBuilder();

// these are from Gilles
List<String> tokens = Util.tokenize(s, " \t."); // very important to always tokenize on periods. M.F. SOLANKI should become M F SOLANKI, not MF SOLANKI
for (int i=0; i<tokens.size(); i++) {
String token = tokens.get(i);
Expand Down

0 comments on commit 399ccda

Please sign in to comment.