Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Surf_2020_tbr_cos_algorithms #61

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Binary file added SURF_Documentation.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions WebContent/filter-controls.jspf
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<%@ page import="in.edu.ashoka.surf.util.Util" %>
<%@ page import="in.edu.ashoka.surf.*" %>
<%
// if we already have filter config, initialize filterSpec, gvc, rvc to the corresponding fields in the config,
// so we can init the dropdowns to the existing config
Expand Down
4 changes: 2 additions & 2 deletions WebContent/read-dataset.jsp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<%@ page import="in.edu.ashoka.surf.*" %>
<%@ page import="edu.stanford.muse.util.Util" %>

<%@ page import="in.edu.ashoka.surf.util.Util" %>
<%-- <%@ page import="edu.stanford.muse.util.Util" %>--%>
<!DOCTYPE html>
<html>
<head>
Expand Down
28 changes: 27 additions & 1 deletion WebContent/select-op.jsp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
<label for="algorithm">Algorithm for clustering <%=mergeCol%></label>
<select class="form-control selectpicker" id="algorithm" name="algorithm">
<option value="editDistance">Edit distance</option>
<!-- <option value="cosinesimilarity">Cosine Similarity</option> -->
<option value="reviewalgo">Review Algorithm</option>
<option value="compatibleNames">Compatible names</option>
<option value="streaks">Streaks</option>
<option value="allNames">All IDs in a single cluster</option>
Expand All @@ -50,7 +52,18 @@
<input type="text" class="form-control" id="edit-distance" name="edit-distance" placeholder="<%=Config.DEFAULT_EDIT_DISTANCE%>">
<span id="edit-distance-0-help" class="help">Edit distance 0 not included</span>
</div>


<div style="display:none" class="div-review-algo">
</div>


<%-- <div style="display:none" class="div-cosine-similarity">
<label for="cosine-similarity">Input value</label>
<input type="text" class="form-control" id="cosine-similarity" name="cosine-similarity" placeholder="<%=Config.DEFAULT_EDIT_DISTANCE%>">
<span id="cosine-similarity-0-help" class="help">Cosine similarity AlGo does not require any user input</span>
</div> --%>


<div class="div-compat-alg-controls">
<div class="div-min-token-overlap">
<label for="min-token-overlap">Token overlap</label>
Expand Down Expand Up @@ -214,6 +227,19 @@
} else {
$('.div-streak-alg-controls').hide();
}

/* if (alg === 'cosinesimilarity') {
$('.div-cosine-similarity').show();
} else {
$('.div-cosine-similarity').hide();
} */

if (alg === 'reviewalgo') {
$('.div-review-algo').show();
} else {
$('.div-review-algo').hide();
}

}

$('#algorithm').change(set_options_for_algorithm);
Expand Down
1 change: 1 addition & 0 deletions src/edu/tsinghua/dbgroup/EditDistanceClusterer.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import java.io.Serializable;
import java.util.Comparator;
import edu.tsinghua.dbgroup.*;

public class EditDistanceClusterer {
private final EditDistanceJoiner mJoiner;
static class SizeComparator implements Comparator<Set<Serializable>> {
Expand Down
1 change: 1 addition & 0 deletions src/edu/tsinghua/dbgroup/EditDistanceJoiner.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import java.util.concurrent.*;

import edu.tsinghua.dbgroup.*;

class EditDistanceJoiner {
private List<String> mStrings;
private final TreeMap<Integer, ArrayList<HashMap<String, ArrayList<Integer>>>> mGlobalIndex;
Expand Down
7 changes: 7 additions & 0 deletions src/in/edu/ashoka/surf/CompatibleNameAlgorithm.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import com.google.common.collect.*;
import in.edu.ashoka.surf.util.Pair;
import in.edu.ashoka.surf.util.Timers;
import in.edu.ashoka.surf.util.UnionFindSet;
import in.edu.ashoka.surf.util.Util;
import org.json.JSONArray;
Expand Down Expand Up @@ -342,9 +343,15 @@ public List<Collection<Row>> run() throws FileNotFoundException {

List<Row> filteredRows = filter.isEmpty() ? (List<Row>) new ArrayList<>(dataset.getRows()) : dataset.getRows().stream().filter(filter::passes).collect(toList());

Timers.CompatibleNameTimer.reset();
Timers.CompatibleNameTimer.start();
// now translate the row#s back to the actual rows
classes = new ArrayList<>();
runRecursive (classes, filteredRows, minTokenOverlap, substringAllowed, initialMapping);

Timers.CompatibleNameTimer.stop();

Timers.log.info ("Time for Compatible Name computation: " + Timers.CompatibleNameTimer.toString());

return classes;
}
Expand Down
1 change: 1 addition & 0 deletions src/in/edu/ashoka/surf/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ public class Config {
public static String MERGE_FIELD = "Name";
public static final int groupsPerPage = 100;
public static final int DEFAULT_EDIT_DISTANCE = 2;
// public static final int DEFAULT_COSINE_ACCURACY = 90;
public static final int DEFAULT_MIN_TOKEN_OVERLAP = 2;
public static final int DEFAULT_IGNORE_TOKEN_FREQUENCY = 200;
public static final int DEFAULT_MIN_SPLITWEIGHT = 10; // a token in a field will be split only if its constituent parts have appeared independently > 10 times. (However, there is an additional factor of 2x needed if the fields are only of length 3)
Expand Down
265 changes: 265 additions & 0 deletions src/in/edu/ashoka/surf/CosineFunc.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
package in.edu.ashoka.surf;

import static java.util.stream.Collectors.toList;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.lang3.text.WordUtils;

import in.edu.ashoka.surf.Dataset;
import in.edu.ashoka.surf.Row;

/**
 * Simple holder for a pair of names, the cosine similarity recorded for that
 * pair, and an integer index supplied by the caller.
 */
class Node {
    String name1;
    String name2;
    double cosinesimilarity;
    int index;

    /**
     * Stores the four values as given; no validation or copying is performed.
     *
     * @param name1            first name of the pair
     * @param name2            second name of the pair
     * @param cosinesimilarity similarity score recorded for the pair
     * @param index            caller-defined index associated with the pair
     */
    public Node(String name1, String name2, double cosinesimilarity, int index) {
        this.index = index;
        this.cosinesimilarity = cosinesimilarity;
        this.name2 = name2;
        this.name1 = name1;
    }

    /**
     * Renders the node as {@code "<index> <name1> <name2> <cosinesimilarity>"},
     * with the fields separated by single spaces.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(index).append(' ');
        text.append(name1).append(' ');
        text.append(name2).append(' ');
        text.append(cosinesimilarity);
        return text.toString();
    }
}

/**
 * Character-frequency representation of a single word: the per-character
 * counts, the set of characters, the vector length (norm) and the word itself.
 * All fields are plain mutable values; nothing is copied on the way in or out.
 */
class obj {

    /** Number of occurrences of each character. */
    HashMap<Character, Integer> hash;
    /** Characters associated with this vector. */
    Set<Character> char_set;
    /** Length (norm) of the count vector. */
    double length;
    /** The word this vector was built from. */
    String word;

    /**
     * @param hash     per-character occurrence counts
     * @param char_set characters associated with the vector
     * @param length   length (norm) of the count vector
     * @param word     the word the vector describes
     */
    public obj(HashMap<Character, Integer> hash, Set<Character> char_set, double length, String word) {
        this.hash = hash;
        this.char_set = char_set;
        this.length = length;
        this.word = word;
    }

    /** @return the per-character occurrence counts (the live map, not a copy) */
    public HashMap<Character, Integer> getHash() {
        return hash;
    }

    /** @param hash the per-character occurrence counts to store */
    public void setHash(HashMap<Character, Integer> hash) {
        this.hash = hash;
    }

    /** @return the character set (the live set, not a copy) */
    public Set<Character> getChar_set() {
        return char_set;
    }

    /** @param char_set the character set to store */
    public void setChar_set(Set<Character> char_set) {
        this.char_set = char_set;
    }

    /** @return the stored vector length */
    public double getLength() {
        return length;
    }

    /**
     * Sets the vector length.
     *
     * <p>Takes a {@code double} so the setter agrees with the field, the
     * constructor and {@link #getLength()}; existing callers that pass an
     * {@code int} still compile because the argument is widened.
     *
     * @param length the vector length to store
     */
    public void setLength(double length) {
        this.length = length;
    }

    /** @return the word this vector describes */
    public String getWord() {
        return word;
    }

    /** @param word the word to store */
    public void setWord(String word) {
        this.word = word;
    }

}

/**
 * Character-level cosine similarity between strings, plus a greedy single-pass
 * grouping of row values whose similarity reaches a caller-supplied threshold.
 */
public class CosineFunc {

    /**
     * Counts how often each character occurs in the given string.
     *
     * @param inputString the string to analyse; must not be {@code null}
     * @return a new map from character to its number of occurrences
     *         (empty for an empty string)
     */
    public static HashMap<Character, Integer> Count(String inputString) {
        HashMap<Character, Integer> charCountMap = new HashMap<Character, Integer>();
        for (char c : inputString.toCharArray()) {
            charCountMap.merge(c, 1, Integer::sum);
        }
        return charCountMap;
    }

    /**
     * Builds the character-count vector for a word.
     *
     * <p>The returned object's character set is the key-set view of its count
     * map, and its length is the Euclidean norm of the counts.
     *
     * @param word the word to convert; must not be {@code null}
     * @return the count map, its key set, the norm and the original word
     */
    public static obj word2vec(String word) {
        HashMap<Character, Integer> counts = Count(word);
        Set<Character> characters = counts.keySet();

        // Accumulate in long so the sum of squares cannot wrap for long inputs.
        long sumOfSquares = 0;
        for (int count : counts.values()) {
            sumOfSquares += (long) count * count;
        }

        return new obj(counts, characters, Math.sqrt(sumOfSquares), word);
    }

    /**
     * Computes the cosine similarity of two character-count vectors.
     *
     * <p>NOTE(review): the norms are stored as already-rounded square roots,
     * so the score for two identical words can come out marginally below 1.0.
     * A threshold of exactly 1.0 may therefore miss exact duplicates.
     *
     * @param vector1 first vector
     * @param vector2 second vector
     * @return the dot product over the shared characters divided by the product
     *         of the two lengths, or 0 when that product is 0
     */
    public static double cosine_similarity(obj vector1, obj vector2) {
        // Copy before intersecting so neither vector's own character set is modified.
        Set<Character> shared = new HashSet<Character>(vector1.getChar_set());
        shared.retainAll(vector2.getChar_set());

        long dotProduct = 0;
        for (Character ch : shared) {
            dotProduct += (long) vector1.getHash().get(ch) * vector2.getHash().get(ch);
        }

        double norm = vector1.length * vector2.length;
        if (norm == 0) {
            return 0;
        }
        return dotProduct / norm;
    }

    /**
     * Greedily groups the values of {@code fieldName} by cosine similarity.
     *
     * <p>Rows are visited in iteration order. Each row not yet assigned starts
     * a new group, and every later unassigned row whose similarity to that
     * first row is at least {@code val} joins it. Each row therefore lands in
     * exactly one group, decided by comparison with the group's first member
     * only. Groups are sets of strings, so rows with equal values that fall in
     * the same group collapse into one entry.
     *
     * @param filteredRows rows to cluster
     * @param fieldName    name of the field whose value is compared
     * @param val          minimum cosine similarity (inclusive) for joining a group
     * @return the groups in order of creation; each keeps insertion order
     */
    public List<Set<String>> assign_similarity(Collection<Row> filteredRows, String fieldName, double val) {
        List<String> names = new ArrayList<>(filteredRows.size());
        filteredRows.forEach(r -> names.add(r.get(fieldName)));

        // Vectorise every name once up front instead of once per comparison.
        List<obj> vectors = new ArrayList<>(names.size());
        for (String name : names) {
            vectors.add(word2vec(name));
        }

        int total = names.size();
        boolean[] assigned = new boolean[total];
        List<Set<String>> clusters = new ArrayList<Set<String>>();

        for (int i = 0; i < total; i++) {
            // A row already placed in a group never seeds one, so none of its
            // pairwise similarities are needed.
            if (assigned[i]) {
                continue;
            }
            assigned[i] = true;

            Set<String> cluster = new LinkedHashSet<String>();
            cluster.add(names.get(i));
            obj seed = vectors.get(i);

            for (int j = i + 1; j < total; j++) {
                // Skip rows that already belong to an earlier group.
                if (assigned[j]) {
                    continue;
                }
                if (cosine_similarity(seed, vectors.get(j)) >= val) {
                    cluster.add(names.get(j));
                    assigned[j] = true;
                }
            }
            clusters.add(cluster);
        }

        return clusters;
    }

}