();

+

+ StringBuffer buffer = new StringBuffer();

- //Step Two: Shrink.

- for(int i=0;i0 ? 1 : 0;

- if(bit == 1){

- result |= 1 << (HASH_LENGTH-1-i);

+ for (int i = 0; i < this.intSimHash.bitLength(); i++) {

+ // 褰撲笖浠呭綋璁剧疆浜嗘寚瀹氱殑浣嶆椂锛岃繑鍥� true

+ boolean sr = simHash.intSimHash.testBit(i);

+

+ if (sr) {

+ buffer.append("1");

+ } else {

+ buffer.append("0");

+ }

+

+ if ((i + 1) % numEach == 0) {

+ // 灏嗕簩杩涘埗杞负BigInteger

+ BigInteger eachValue = new BigInteger(buffer.toString(), 2);

+ System.out.println("----" + eachValue);

+ buffer.delete(0, buffer.length());

+ characters.add(eachValue);

}

}

- System.out.println("String \""+str+ "\" hashcode is:"+result

- +". Binary format is: "+Integer.toBinaryString(result));

- return result;

+

+ return characters;

}

-}

+ /**

+ * Demo driver: builds three SimHash instances from three variants of a text and

+ * prints, for each, its intSimHash value together with a bit statistic, calls

+ * subByDistance(..., 3) against hash1, and finally prints hammingDistance(...)

+ * next to getDistance(...) for hash1 vs. hash2 and hash1 vs. hash3.

+ *

+ * The second constructor argument (64) is presumably the hash width in bits -

+ * TODO confirm against the SimHash constructor, which is not visible here.

+ *

+ * @param args command-line arguments (not read)

+ * @throws IOException declared by the signature; the throwing call is not visible in this method

+ */

+ public static void main(String[] args) throws IOException {

+ String s = "浼犵粺鐨� hash 绠楁硶鍙礋璐e皢鍘熷鍐呭灏介噺鍧囧寑闅忔満鍦版槧灏勪负涓�涓鍚嶅�硷紝" + "鍘熺悊涓婄浉褰撲簬浼殢鏈烘暟浜х敓绠楁硶銆備骇鐢熺殑涓や釜绛惧悕锛屽鏋滅浉绛夛紝璇存槑鍘熷鍐呭鍦ㄤ竴瀹氭 鐜� 涓嬫槸鐩哥瓑鐨勶紱"

+ + "濡傛灉涓嶇浉绛夛紝闄や簡璇存槑鍘熷鍐呭涓嶇浉绛夊锛屼笉鍐嶆彁渚涗换浣曚俊鎭紝鍥犱负鍗充娇鍘熷鍐呭鍙浉宸竴涓瓧鑺傦紝" + "鎵�浜х敓鐨勭鍚嶄篃寰堝彲鑳藉樊鍒瀬澶с�備粠杩欎釜鎰忎箟 涓婃潵 璇达紝瑕佽璁′竴涓� hash 绠楁硶锛�"

+ + "瀵圭浉浼肩殑鍐呭浜х敓鐨勭鍚嶄篃鐩歌繎锛屾槸鏇翠负鑹伴毦鐨勪换鍔★紝鍥犱负瀹冪殑绛惧悕鍊奸櫎浜嗘彁渚涘師濮嬪唴瀹规槸鍚︾浉绛夌殑淇℃伅澶栵紝" + "杩樿兘棰濆鎻愪緵涓嶇浉绛夌殑 鍘熷鍐呭鐨勫樊寮傜▼搴︾殑淇℃伅銆�";

+ SimHash hash1 = new SimHash(s, 64);

+ System.out.println(hash1.intSimHash + " " + hash1.intSimHash.bitLength());

+ // Compute the hash value of each signature block, for a Hamming distance within 3

+ hash1.subByDistance(hash1, 3);

+

+ // Variant 2: drop the first sentence and add two interference strings

+ // (translated from the original comment)

+ s = "鍘熺悊涓婄浉褰撲簬浼殢鏈烘暟浜х敓绠楁硶銆備骇鐢熺殑涓や釜绛惧悕锛屽鏋滅浉绛夛紝璇存槑鍘熷鍐呭鍦ㄤ竴瀹氭 鐜� 涓嬫槸鐩哥瓑鐨勶紱"

+ + "濡傛灉涓嶇浉绛夛紝闄や簡璇存槑鍘熷鍐呭涓嶇浉绛夊锛屼笉鍐嶆彁渚涗换浣曚俊鎭紝鍥犱负鍗充娇鍘熷鍐呭鍙浉宸竴涓瓧鑺傦紝" + "鎵�浜х敓鐨勭鍚嶄篃寰堝彲鑳藉樊鍒瀬澶с�備粠杩欎釜鎰忎箟 涓婃潵 璇达紝瑕佽璁′竴涓� hash 绠楁硶锛�"

+ + "瀵圭浉浼肩殑鍐呭浜х敓鐨勭鍚嶄篃鐩歌繎锛屾槸鏇翠负鑹伴毦鐨勪换鍔★紝鍥犱负瀹冪殑绛惧悕鍊奸櫎浜嗘彁渚涘師濮嬪唴瀹规槸鍚︾浉绛夌殑淇℃伅澶栵紝" + "骞叉壈1杩樿兘棰濆鎻愪緵涓嶇浉绛夌殑 鍘熷鍐呭鐨勫樊寮傜▼搴︾殑淇℃伅銆�";

+ SimHash hash2 = new SimHash(s, 64);

+ // NOTE(review): hash1 above prints bitLength(), while hash2 and hash3 print bitCount() - confirm which one is intended.

+ System.out.println(hash2.intSimHash + " " + hash2.intSimHash.bitCount());

+ hash1.subByDistance(hash2, 3);

+

+ // Variant 3: prepend a sentence before the first one and add four interference strings

+ // (translated from the original comment)

+ s = "imhash绠楁硶鐨勮緭鍏ユ槸涓�涓悜閲忥紝杈撳嚭鏄竴涓� f 浣嶇殑绛惧悕鍊笺�備负浜嗛檲杩版柟渚匡紝" + "鍋囪杈撳叆鐨勬槸涓�涓枃妗g殑鐗瑰緛闆嗗悎锛屾瘡涓壒寰佹湁涓�瀹氱殑鏉冮噸銆�"

+ + "浼犵粺骞叉壈4鐨� hash 绠楁硶鍙礋璐e皢鍘熷鍐呭灏介噺鍧囧寑闅忔満鍦版槧灏勪负涓�涓鍚嶅�硷紝" + "鍘熺悊涓婅繖娆″樊寮傛湁澶氬ぇ鍛�3鐩稿綋浜庝吉闅忔満鏁颁骇鐢熺畻娉曘�備骇鐢熺殑涓や釜绛惧悕锛屽鏋滅浉绛夛紝"

+ + "璇存槑鍘熷鍐呭鍦ㄤ竴瀹氭 鐜� 涓嬫槸鐩哥瓑鐨勶紱濡傛灉涓嶇浉绛夛紝闄や簡璇存槑鍘熷鍐呭涓嶇浉绛夊锛屼笉鍐嶆彁渚涗换浣曚俊鎭紝" + "鍥犱负鍗充娇鍘熷鍐呭鍙浉宸竴涓瓧鑺傦紝鎵�浜х敓鐨勭鍚嶄篃寰堝彲鑳藉樊鍒瀬澶с�備粠杩欎釜鎰忎箟 涓婃潵 璇达紝"

+ + "瑕佽璁′竴涓� hash 绠楁硶锛屽鐩镐技鐨勫唴瀹逛骇鐢熺殑绛惧悕涔熺浉杩戯紝鏄洿涓鸿壈闅剧殑浠诲姟锛屽洜涓哄畠鐨勭鍚嶅�奸櫎浜嗘彁渚涘師濮�" + "鍐呭鏄惁鐩哥瓑鐨勪俊鎭锛屽共鎵�1杩樿兘棰濆鎻愪緵涓嶇浉绛夌殑 鍘熷鍐嶆潵骞叉壈2鍐呭鐨勫樊寮傜▼搴︾殑淇℃伅銆�";

+ SimHash hash3 = new SimHash(s, 64);

+ System.out.println(hash3.intSimHash + " " + hash3.intSimHash.bitCount());

+ hash1.subByDistance(hash3, 3);

+

+ System.out.println("============================");

+

+ int dis = hash1.getDistance(hash1.strSimHash, hash2.strSimHash);

+ System.out.println(hash1.hammingDistance(hash2) + " " + dis);

+ // By the pigeonhole principle (also called the drawer principle; see combinatorics), if two signatures are within Hamming distance 3, at least one of their signature blocks from subByDistance() must be identical.

+ int dis2 = hash1.getDistance(hash1.strSimHash, hash3.strSimHash);

+ System.out.println(hash1.hammingDistance(hash3) + " " + dis2);

+ }

+}

\ No newline at end of file

diff --git a/quick-simhash/src/main/java/com/quick/simhash/SimHashTool.java b/quick-simhash/src/main/java/com/quick/simhash/SimHashTool.java

new file mode 100644

index 0000000..de6812b

--- /dev/null

+++ b/quick-simhash/src/main/java/com/quick/simhash/SimHashTool.java

@@ -0,0 +1,241 @@

+package com.quick.simhash;

+

+import java.io.BufferedReader;

+import java.io.File;

+import java.io.FileReader;

+import java.io.IOException;

+

+/**
+ * Similarity-hash (SimHash) utility: builds a fixed-width binary fingerprint for
+ * a word-segmented article and compares two articles bit by bit.
+ *
+ * @author lyq
+ */
+public class SimHashTool {
+    /** Suffix appended to a source path (minus its extension) to name the segmenter output. */
+    private static final String SPLIT_SUFFIX = "-split.txt";
+
+    // Number of bits in the binary fingerprint.
+    private final int hashBitNum;
+    // Minimum fraction of matching bits; two articles are reported as similar
+    // only when sameBits > hashBitNum * minSupportValue.
+    private final double minSupportValue;
+
+    /**
+     * @param hashBitNum
+     *            number of fingerprint bits; must be positive
+     * @param minSupportValue
+     *            minimum fraction of identical bits for two articles to count as similar
+     * @throws IllegalArgumentException
+     *             if {@code hashBitNum} is not positive
+     */
+    public SimHashTool(int hashBitNum, double minSupportValue) {
+        if (hashBitNum <= 0) {
+            throw new IllegalArgumentException("hashBitNum must be positive: " + hashBitNum);
+        }
+        this.hashBitNum = hashBitNum;
+        this.minSupportValue = minSupportValue;
+    }
+
+    /**
+     * Compares two articles and prints whether they are similar.
+     *
+     * Both files are read in full, fingerprinted with
+     * {@link #calSimHashValue(String)}, and the number of identical fingerprint
+     * bits is compared against {@code hashBitNum * minSupportValue}.
+     *
+     * @param newsPath1
+     *            path of the first (already word-segmented) article
+     * @param newsPath2
+     *            path of the second (already word-segmented) article
+     * @throws java.io.UncheckedIOException
+     *             if either file cannot be read
+     */
+    public void compareArticals(String newsPath1, String newsPath2) {
+        // Read the segmentation results first, then fingerprint both.
+        String content1 = readDataFile(newsPath1);
+        String content2 = readDataFile(newsPath2);
+        int[] hashArray1 = calSimHashValue(content1);
+        int[] hashArray2 = calSimHashValue(content2);
+
+        // Count the positions at which both fingerprints carry the same bit.
+        int sameNum = 0;
+        for (int i = 0; i < hashBitNum; i++) {
+            if (hashArray1[i] == hashArray2[i]) {
+                sameNum++;
+            }
+        }
+
+        // Compare against the minimum threshold and report.
+        double similarity = sameNum * 1.0 / hashBitNum;
+        if (sameNum > this.hashBitNum * this.minSupportValue) {
+            System.out.println(String.format("相似度为%s,超过阈值%s,所以新闻1与新闻2是相似的",
+                    similarity, minSupportValue));
+        } else {
+            System.out.println(String.format("相似度为%s,小于阈值%s,所以新闻1与新闻2不是相似的",
+                    similarity, minSupportValue));
+        }
+    }
+
+    /**
+     * Computes the SimHash fingerprint of a segmented text.
+     *
+     * The content is split on single spaces; every token of the form
+     * {@code word/tag} contributes its word, tokens without a {@code '/'} are
+     * ignored. Each word's BKDR hash is reduced to {@code hashBitNum} bits and
+     * its weight (from {@code News.getWordFrequentValue}) is added to every
+     * position holding a 1 bit and subtracted from every position holding a 0
+     * bit. Positions with a strictly positive total become 1, all others 0.
+     *
+     * @param content
+     *            segmented article text
+     * @return fingerprint as an array of {@code hashBitNum} values, each 0 or 1,
+     *         most significant bit first
+     */
+    private int[] calSimHashValue(String content) {
+        News news = new News(content);
+        news.statWords();
+
+        double[] hashArray = new double[hashBitNum];
+        int[] resultValue = new int[hashBitNum];
+
+        // BKDRHash never returns a negative value, and for a non-negative x
+        // "x % 2^n" equals "x & (2^n - 1)". Masking stays in exact long
+        // arithmetic, whereas "%= Math.pow(2, n)" goes through double and
+        // drops the low bits of hashes above 2^53.
+        long mask = hashBitNum >= Long.SIZE - 1 ? Long.MAX_VALUE : (1L << hashBitNum) - 1;
+
+        for (String str : content.split(" ")) {
+            int index = str.indexOf('/');
+            if (index == -1) {
+                continue;
+            }
+            String w = str.substring(0, index);
+
+            // Weight derived from the word frequency; -1 marks a word to skip.
+            double weight = news.getWordFrequentValue(w);
+            if (weight == -1) {
+                continue;
+            }
+
+            // Hash the word and keep only the lowest hashBitNum bits.
+            long hashValue = BKDRHash(w) & mask;
+
+            // Expand into one array slot per bit.
+            int[] binaryArray = new int[hashBitNum];
+            numToBinaryArray(binaryArray, hashValue);
+
+            for (int i = 0; i < binaryArray.length; i++) {
+                if (binaryArray[i] == 1) {
+                    // A 1 bit adds the weight ...
+                    hashArray[i] += weight;
+                } else {
+                    // ... and a 0 bit subtracts it.
+                    hashArray[i] -= weight;
+                }
+            }
+        }
+
+        // Collapse the accumulated weights back to bits by their sign.
+        for (int i = 0; i < hashArray.length; i++) {
+            resultValue[i] = hashArray[i] > 0 ? 1 : 0;
+        }
+
+        return resultValue;
+    }
+
+    /**
+     * Writes the binary representation of a non-negative number into an array,
+     * most significant bit at index 0 and least significant bit in the last
+     * slot. Bits that do not fit into the array are dropped.
+     *
+     * @param binaryArray
+     *            destination array; every slot is overwritten with 0 or 1
+     * @param num
+     *            number to convert
+     */
+    private void numToBinaryArray(int[] binaryArray, long num) {
+        int last = binaryArray.length - 1;
+        for (int bit = 0; bit <= last; bit++) {
+            // Java masks long shift distances to 6 bits, so positions beyond
+            // the width of a long must be zero-filled explicitly.
+            binaryArray[last - bit] = bit < Long.SIZE ? (int) ((num >>> bit) & 1L) : 0;
+        }
+    }
+
+    /**
+     * BKDR string hash with seed 31.
+     *
+     * @param str
+     *            string to hash; must not be null
+     * @return non-negative hash value (the arithmetic wraps on overflow)
+     */
+    public static long BKDRHash(String str) {
+        final int seed = 31; /* 31 131 1313 13131 131313 etc.. */
+        long hash = 0;
+
+        for (int i = 0; i < str.length(); i++) {
+            hash = (hash * seed) + str.charAt(i);
+        }
+
+        // Math.abs(Long.MIN_VALUE) is still negative; map that single value to 0
+        // so the documented non-negative contract always holds.
+        return hash == Long.MIN_VALUE ? 0 : Math.abs(hash);
+    }
+
+    /**
+     * Reads a whole text file into one string.
+     *
+     * Line terminators are dropped and nothing is inserted in their place, so
+     * the last token of a line runs into the first token of the next one unless
+     * the line ends with a space. The file is decoded by {@link FileReader};
+     * NOTE(review): that is the platform default charset on older JDKs -
+     * confirm it matches the segmenter output.
+     *
+     * @param filePath
+     *            path of the file to read
+     * @return file content with line terminators removed
+     * @throws java.io.UncheckedIOException
+     *             if the file cannot be opened or read
+     */
+    private String readDataFile(String filePath) {
+        StringBuilder strBuilder = new StringBuilder();
+
+        // try-with-resources closes the reader even when readLine() fails.
+        try (BufferedReader in = new BufferedReader(new FileReader(new File(filePath)))) {
+            String str;
+            while ((str = in.readLine()) != null) {
+                strBuilder.append(str);
+            }
+        } catch (IOException e) {
+            // Fail loudly: a partially read article would yield a meaningless fingerprint.
+            throw new java.io.UncheckedIOException("Failed to read " + filePath, e);
+        }
+
+        return strBuilder.toString();
+    }
+
+    /**
+     * Runs the ICTCLAS50 word segmenter on an article file and writes the
+     * result next to it as {@code <name>-split.txt}.
+     *
+     * The segmenter is initialised from the {@code lib} directory under the
+     * current working directory. Failures are reported on the console and
+     * otherwise ignored.
+     *
+     * @param srcPath
+     *            path of the article file
+     */
+    private void parseNewsContent(String srcPath) {
+        String dirApi = new File(System.getProperty("user.dir"), "lib").getPath();
+
+        // Build the output path: strip the extension, i.e. the last '.' of the
+        // file name itself. A dot inside a directory name ("./a/b") is not an extension.
+        int dot = srcPath.lastIndexOf('.');
+        int separator = Math.max(srcPath.lastIndexOf('/'), srcPath.lastIndexOf('\\'));
+        String stem = dot > separator ? srcPath.substring(0, dot) : srcPath;
+        String desPath = stem + SPLIT_SUFFIX;
+
+        try {
+            ICTCLAS50 testICTCLAS50 = new ICTCLAS50();
+            // Initialise with the path of the libraries the segmenter needs.
+            if (!testICTCLAS50.ICTCLAS_Init(dirApi.getBytes("GB2312"))) {
+                System.out.println("Init Fail!");
+                return;
+            }
+            try {
+                // NOTE(review): the file names use the platform default charset
+                // while the library path above uses GB2312 - confirm intended.
+                byte[] inputFileName = srcPath.getBytes();
+                byte[] outputFileName = desPath.getBytes();
+
+                // Arguments, per the original comment: input file name, file
+                // encoding type, part-of-speech tagging (1 yes, 0 no), output file name.
+                testICTCLAS50.ICTCLAS_FileProcess(inputFileName, 0, 1, outputFileName);
+            } finally {
+                // Shut the segmenter down even when processing fails.
+                testICTCLAS50.ICTCLAS_Exit();
+            }
+        } catch (Exception ex) {
+            ex.printStackTrace();
+        }
+    }
+
+}

\ No newline at end of file

diff --git a/quick-simhash/src/main/resources/IKAnalyzer.cfg.xml b/quick-simhash/src/main/resources/IKAnalyzer.cfg.xml

new file mode 100644

index 0000000..c26bc4a

--- /dev/null

+++ b/quick-simhash/src/main/resources/IKAnalyzer.cfg.xml

@@ -0,0 +1,11 @@

+

+

++IK Analyzer 扩展配置

+

+

+stopword.dic;

+

+

\ No newline at end of file

diff --git a/quick-simhash/src/main/resources/stopword.dic b/quick-simhash/src/main/resources/stopword.dic

new file mode 100644

index 0000000..c1b994b

--- /dev/null

+++ b/quick-simhash/src/main/resources/stopword.dic

@@ -0,0 +1,33 @@

+a

+an

+and

+are

+as

+at

+be

+but

+by

+for

+if

+in

+into

+is

+it

+no

+not

+of

+on

+or

+such

+that

+the

+their

+then

+there

+these

+they

+this

+to

+was

+will

+with

\ No newline at end of file

java int数列转字符串,图片转字符串相关推荐

  1. java int to hex_Java字符串转16 进制工具类Hex.java | 学步园

    Java 字符串转 16 进制工具类 Hex.java 实现 16进制 0xfecd .. 和 java 字符串之间的互转换! 如果做开发,通常用户登陆密码都会 mad5(salt + pwd) 然后 ...

  2. 神经网络输入图片大小

    神经网络训练过程中图片像素对训练结果有什么影响,由于GPU内存太小,将224*224改成了120*120 . 有影响像素越高相对需要的网络结构更复杂优化技术更好训练时间更长超参数的设置等就好比CIFA ...

  3. Java中同时输入字符串和int类型出错的处理方式

    在Java中,如果输入int类型和字符串处理不当会产生错误,例如:如果先输入int类型,在输入字符串类型,如下代码: Scanner sc=new Scanner(System.in);int a=s ...

  4. java将int转换成字符串,Java将int转换为字符串

    我们可以使用String.valueOf()和Integer.toString()方法在Java中将int转换为String.另外,我们可以使用String.format()方法,字符串连接运算符等. ...

  5. 数据结构与算法Java(二)——字符串、矩阵压缩、递归、动态规划

    不定期补充.修正.更新:欢迎大家讨论和指正 本文以数据结构(C语言版)第三版 李云清 杨庆红编著为主要参考资料,用Java来实现 数据结构与算法Java(一)--线性表 数据结构与算法Java(二)- ...

  6. Java:判断一个字符串中是否存在另一个字符子串以及判断一个字符串中是否存在指定字符

    Java:判断一个字符串中包含指定字符子串,判断一个字符串中存在指定字符 字符串的contains方法可以判断一个字符串中是否存在另一个字符子串,示例如下 String Str = "Hel ...

  7. Java - 将整数转换为字符串[duplicate]

    本文翻译自:Java - Convert integer to string [duplicate] This question already has an answer here: 这个问题在这里 ...

  8. java判断回文字符串几种简单的实现

    11年it研发经验,从一个会计转行为算法工程师,学过C#,c++,java,android,php,go,js,python,CNN神经网络,四千多篇博文,三千多篇原创,只为与你分享,共同成长,一起进 ...

  9. java 补0_Java String字符串补0或空格

    下面是编程之家 jb51.cc 通过网络收集整理的代码片段. 编程之家小编现在分享给大家,也给大家做个参考. package cn.com.songjy; import java.text.Numbe ...

最新文章

  1. jeecg3.5.2中上传下载文件的示例中的的一个bug
  2. java直接量_Java教程:Java直接量(字面量)
  3. android 加载html6,WebView使用总结2(加载HTML内容形式的String)
  4. Saas与传统软件对比
  5. Spring State Machine:它是什么,您需要它吗?
  6. mysql事务-与pymyql的事务
  7. git 实践(二) push的使用
  8. mod sim tcp配置_ModSim32-ModScan32Modbus调试工具使用及配置说明.pdf
  9. FL计算机软件,FL Studio水果编曲软件
  10. 2022最新第四方聚合支付系统源码+详细搭建教程
  11. 遵义微红科技社群直播分销系统精选最具市场营销的功能点
  12. Unity接入ios SDK(小7手游)没有你想的那么难
  13. 迷失在森林里的小女孩
  14. Dubbo源码分析(三) -- Dubbo的服务发现源码深入解析4万字长文
  15. 关于AsyncHttpClient的cz.msebera.android.httpclient.Header
  16. 从阿里云下载centos的步骤
  17. UploadFile图片上传案例
  18. mybatis-plus视图查询
  19. Flutter企业级项目实战——博时App
  20. 极简嵌入式C语言教程——从入门到入土(2)

热门文章

  1. c++图书管理系统_轻松学做C语言课程设计:图书管理系统-数组实现
  2. html post后404,为什么在vue-cli脚手架下启动的服务器POST返回404呢?WebStorm启动没有问题?...
  3. 通过FPGA将图片信息通过RS232串口发送到PC端,使用MATLAB进行图片显示
  4. C++类与static关键字
  5. nginx 限流,以及nginx直接返回json格式数据
  6. Netflix Play API:我们为什么构建了一个演进式架构?
  7. matlab-画个拱桥和倒影?
  8. centos7 以上和以下版本设置
  9. CSS学习之多类别选择器
  10. VMware vSphere Client安装Centos7