【Similarity calculation】Jaccard Distance

cooliufang

浏览: 127598 次

最近访客更多访客>>

gabysen

qiah1120

ZixinA

houwanxing

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Algorithm

java Jaccard distance

Jaccard token distance
It is simply given by the number of common tokens in two names and the count of total number of tokens in those names.

Jaccard distance (simplify)
To reduce the computational complexity

Jaccard distance (weighted)
weighted Jaccard distance is equal to the following expression

then Jaccard similarity function only need to take last past from the above each function.


import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class JaccardDistance {
	public static Map<String, Double> weightMap = new HashMap<String, Double>();

	/**
	 * intersection between two strings
	 * @param source
	 * @param target
	 * @return
	 */
	public static List<String> intersection(String source, String target) {
		List<String> slist = Arrays.asList(source.split(" "));
		List<String> tlist = Arrays.asList(target.split(" "));
		List<String> intersection = new ArrayList<String>();		
		
		for (String s: slist) {
			if (tlist.contains(s)) {
				if (!intersection.contains(s)) {
					intersection.add(s);
				}
			}
		}
		
		return intersection;
	}
	/**
	 * J(s,t) = 1 - intersection(s, t).size()) / (s.size() + t.size() - intersection.size())	 * 
	 * @param source
	 * @param target
	 */
	public static double Jaccard1(String source, String target) {
		List<String> slist = Arrays.asList(source.split(" "));
		List<String> tlist = Arrays.asList(target.split(" "));
		List<String> intersection = intersection(source, target);		
		
		return (double) 1 - intersection.size() / (double)(slist.size() + tlist.size() - intersection.size());
	}
	
	/**
	 * J(s,t) = 1 - 2 * intersection(s, t).size()) / (s.size() + t.size())
	 * @param source
	 * @param target
	 * @return
	 */
	public static double Jaccard2(String source, String target) {
		List<String> slist = Arrays.asList(source.split(" "));
		List<String> tlist = Arrays.asList(target.split(" "));
		List<String> intersection = intersection(source, target);			
		
		return (double) 1 - 2 * intersection.size() / (double)(slist.size() + tlist.size());
	}
	
	/**
	 * J(s,t) each token has weight value.
	 * @param stringList
	 * @param token
	 * @return
	 */
	public static void JaccardWeight(List<String> stringList) {
		Map<String, Integer> freqMap = new HashMap<String, Integer>();
		
		for (String string : stringList) {
			List<String> slist = Arrays.asList(string.split(" "));
			for (String s : slist)  {
				s = s.trim();
				if (freqMap.containsKey(s)) {
					freqMap.put(s, freqMap.get(s)+1);
				} else {
					freqMap.put(s, 1);
				}
			}
		}
		
		for (String key : freqMap.keySet()) {
			int freq = freqMap.get(key);
			double weight = (double) 1 / (Math.log(freq) + 1);
			weightMap.put(key, weight);
		}		
//		return weightMap;		
	}
	
	
	public static double Jaccard3(String source, String target) {
		List<String> slist = Arrays.asList(source.split(" "));
		List<String> tlist = Arrays.asList(target.split(" "));
		List<String> intersection = intersection(source, target);
		
		double intersectionWeight = 0;
		double sourceWeight = 0;
		double targetWeight = 0;
		for (String s : intersection) {
			intersectionWeight += weightMap.get(s);
		}
		for (String s : slist) {
			sourceWeight += weightMap.get(s);
		}
		for (String s: tlist) {
			targetWeight += weightMap.get(s);
		}
		
		return 1 - 2 * intersectionWeight / (sourceWeight + targetWeight);
	}
	
	
	//main
	public static void main(String[] args) {
		String s1 = "AAE HOLDING";
		String s2 = "AAE TECHNOLOGY INTERNATIONAL";
		String s3 = "AGRIPA HOLDING";
		System.out.println("J1(s1, s2) = " + Jaccard1(s1, s2));
		System.out.println("J1(s1, s3) = " + Jaccard1(s1, s3));
		System.out.println("J1(s2, s3) = " + Jaccard1(s2, s3));

		System.out.println();
		System.out.println("J2(s1, s2) = " + Jaccard2(s1, s2));
		System.out.println("J2(s1, s3) = " + Jaccard2(s1, s3));
		System.out.println("J2(s2, s3) = " + Jaccard2(s2, s3));
		
		System.out.println();
		List<String> stringList = new ArrayList<String>();
		Collections.addAll(stringList, s1, s2, s3);
		JaccardWeight(stringList);
		System.out.println(weightMap);
		System.out.println("J3(s1, s2) = " + Jaccard3(s1, s2));
		System.out.println("J3(s1, s3) = " + Jaccard3(s1, s3));
		System.out.println("J3(s2, s3) = " + Jaccard3(s2, s3));
	}

}

Output:
J1(s1, s2) = 0.75
J1(s1, s3) = 0.6666666666666667
J1(s2, s3) = 1.0

J2(s1, s2) = 0.6
J2(s1, s3) = 0.5
J2(s2, s3) = 1.0

{AAE=0.5906161091496412, TECHNOLOGY=1.0, AGRIPA=1.0, INTERNATIONAL=1.0, HOLDING=0.5906161091496412}
J3(s1, s2) = 0.6868293431358082
J3(s1, s3) = 0.5738467337473576
J3(s2, s3) = 1.0