从JavaScript数组去重谈起

去重这个需求古已有之。然而，由于JavaScript语言的特性，导致去重这种需求既好做，又不好做。现在是ECMAScript 6时代了，还用着以前的方式，感觉也是不太对劲呢。另外，深入深入这玩意，没准还能有些别样的收获。

测试机：i3-3220 / 16G / Windows 10 Build 10166 / iojs 2.2.1 / nodejs 0.12.0

先来一段鬼畜的初始化代码：

var aryInt = [];

var aryStr = [];

var aryMix = [];



function insertToArray(i) {

	aryInt.push(i);

	aryStr.push(i.toString());

	aryMix.push(i.toFixed(3), i, i.toString());

}



console.time("All test");

console.log("Initializing test data...");

for (var i = 1; i <= 100000; i++) {

	insertToArray(i); // 普通数据

}

for (i = 2; i <= 100000; i = i * 1.01) { // 越后期越稀疏

	insertToArray(parseInt(i));

}/*

console.log("Sorting test data...");

aryInt.sort();

aryStr.sort();

aryMix.sort();

*/

module.exports = {

	int: aryInt,

	str: aryStr,

	mix2: aryMix

};



console.log("Output 10 data:");

var testString;

for (var key in module.exports) {

	testString = [];

	for (i = 0; i < 10; i++) {

		testString.push(module.exports[key][i]);

	}

	console.log(key + ": " + testString.join(" "));

}

console.log("Start test!");

没错，这就是很鬼畜的数据。将其保存为array.js

然后，我们的每个文件都要加上如下代码，配上名字为unique的函数，来实现我们的自动化测试。

var array = require('./array');

var i, testString = [], newArray = [];

for (var key in array) {

	console.time(key);

	newArray = unique(array[key]);

	console.timeEnd(key);



	for (i = 0; i < 10; i++) {

		testString.push((typeof newArray[i] == "string" ? '"' + newArray[i] + '"': newArray[i]));

	}

	console.log("10 items: " + testString.join(" "));

	testString = [];

}

console.timeEnd("All test");

我们知道，（看起来）最省空间的方式也就是直接从数组内搜索，如直接双层循环，或是用indexOf.

借用玉伯大大的两个算法（https://github.com/lifesinger/lifesinger.github.com/issues/113），分别是indexOf和IE6 - 8下的indexOf，命名为indexOf.js和doubleFor.js（因为IE6下的就是两个for循环实现的）。理论时间复杂度为O(n²)。

因为耗时太长，所以我停止了mix型数据的去重。测试结果如下：

λ iojs .\indexOf

Initializing test data...

Sorting test data...

Output 10:

int: 1 2 3 4 5 6 7 8 9 10

str: 1 2 3 4 5 6 7 8 9 10

mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000

Start test!

int: 5064ms

10 items: 1 2 3 4 5 6 7 8 9 10

str: 40610ms

10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"

λ iojs .\doubleFor

Initializing test data...

Sorting test data...

Output 10:

int: 1 2 3 4 5 6 7 8 9 10

str: 1 2 3 4 5 6 7 8 9 10

mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000

Start test!

int: 5051ms

10 items: 1 2 3 4 5 6 7 8 9 10

str: 45762ms

10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"

λ iojs .\indexOf

Initializing test data...

Sorting test data...

Output 10:

int: 1 2 3 4 5 6 7 8 9 10

str: 1 2 3 4 5 6 7 8 9 10

mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000

Start test!

int: 4978ms

10 items: 1 2 3 4 5 6 7 8 9 10

str: 49288ms

10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"

λ iojs .\doubleFor

Initializing test data...

Sorting test data...

Output 10:

int: 1 2 3 4 5 6 7 8 9 10

str: 1 2 3 4 5 6 7 8 9 10

mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000

Start test!

int: 4915ms

10 items: 1 2 3 4 5 6 7 8 9 10

str: 43357ms

10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"

别问我为什么indexOf和doubleFor速度差不多甚至后者可能更快，我心累……

关于indexOf，ECMAScript有标准的实现规范，不另加摘录： http://www.ecma-international.org/ecma-262/6.0/#sec-array.prototype.indexof

看起来很鬼畜？实际上这坨东西的意思也只是把这个数组遍历搜索一遍，时间复杂度为O(n)。

把排序打开，把范围调小；我们可以看到，这两个的输出结果还是能完美区分数字和字符串类型。

但是这个差距是怎么回事呢？直接看看v8的代码吧：https://github.com/v8/v8/blob/master/src/array.js#L1304。 ArrayIndexOf函数的确完整实现了ECMAScript，然而其的各种限制太多，又是JavaScript所写，当然比不过我们自己实现的indexOf.

λ iojs .\indexOf

Initializing test data...

Sorting test data...

Output 10:

int: 1 10 10 10 10 10 10 10 10 10

str: 1 10 10 10 10 10 10 10 10 10

mix2: 1 1 1.000 10 10 10 10 10 10 10

Start test!

int: 4ms

10 items: 1 10 100 1000 101 102 103 104 105 106

str: 9ms

10 items: "1" "10" "100" "1000" "101" "102" "103" "104" "105" "106"

mix2: 67ms

10 items: "1" 1 "1.000" 10 "10" "10.000" 100 "100" "100.000" 1000

All test: 98ms

λ iojs .\doubleFor

Initializing test data...

Sorting test data...

Output 10:

int: 1 10 10 10 10 10 10 10 10 10

str: 1 10 10 10 10 10 10 10 10 10

mix2: 1 1 1.000 10 10 10 10 10 10 10

Start test!

int: 2ms

10 items: 1 10 100 1000 101 102 103 104 105 106

str: 6ms

10 items: "1" "10" "100" "1000" "101" "102" "103" "104" "105" "106"

mix2: 32ms

10 items: "1" 1 "1.000" 10 "10" "10.000" 100 "100" "100.000" 1000

All test: 57ms

得出结论1：V8下，双层For有相当大的可能比indexOf操作符来得快。

然后，看玉伯的第三个方案，也就是Hash。我们可以再弄一个不带typeof的版本出来测试。另外，第二行是经过排序后的版本。

λ iojs .\hashWithTypeof

Initializing test data...

Output 10:

int: 1 2 3 4 5 6 7 8 9 10

str: 1 2 3 4 5 6 7 8 9 10

mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000

Start test!

int: 123ms

10 items: 1 2 3 4 5 6 7 8 9 10

str: 163ms

10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"

mix2: 392ms

10 items: "1.000" 1 "1" "2.000" 2 "2" "3.000" 3 "3" "4.000"

All test: 768ms

λ iojs .\hashWithoutTypeof

Initializing test data...

Output 10:

int: 1 2 3 4 5 6 7 8 9 10

str: 1 2 3 4 5 6 7 8 9 10

mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000

Start test!

int: 26ms

10 items: 1 2 3 4 5 6 7 8 9 10

str: 41ms

10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"

mix2: 135ms

10 items: "1.000" 1 "2.000" 2 "3.000" 3 "4.000" 4 "5.000" 5

All test: 289ms

λ iojs .\hashWithTypeof

Initializing test data...

Sorting test data...

Output 10:

int: 1 10 10 10 10 10 10 10 10 10

str: 1 10 10 10 10 10 10 10 10 10

mix2: 1 1 1.000 10 10 10 10 10 10 10

Start test!

int: 145ms

10 items: 1 10 100 1000 10000 100000 10001 10002 10003 10004

str: 138ms

10 items: "1" "10" "100" "1000" "10000" "100000" "10001" "10002" "10003" "10004"

mix2: 440ms

10 items: 1 "1" "1.000" 10 "10" "10.000" "100" 100 "100.000" "1000"

All test: 1323ms

λ iojs .\hashWithoutTypeof

Initializing test data...

Sorting test data...

Output 10:

int: 1 10 10 10 10 10 10 10 10 10

str: 1 10 10 10 10 10 10 10 10 10

mix2: 1 1 1.000 10 10 10 10 10 10 10

Start test!

int: 37ms

10 items: 1 10 100 1000 10000 100000 10001 10002 10003 10004

str: 62ms

10 items: "1" "10" "100" "1000" "10000" "100000" "10001" "10002" "10003" "10004"

mix2: 168ms

10 items: 1 "1.000" 10 "10.000" "100" "100.000" "1000" "1000.000" 10000 "10000.000"

All test: 888ms

然而，typeof的性能如此低下，让人简直不敢直接用。更何况，其甚至还有对于new String这样的Object返回的仍然是Object的问题……然而，其实还有更慢的方法：把typeof换成

Object.prototype.toString.call(item)

这样的话，性能测试比左侧typeof得分还低……

得出结论2：V8下，尽量少判断某个元素的类型。毕竟是动态语言……

推论：如果对类型要求比较强，还是用TypeScript会更为方便……

再往下深究吧。

V8里，一个数组使用的对象是Map（见Genesis::InstallInternalArray），不过内部实现是什么数据结构就不知道了=_= ，Map对象使用的（似乎）是HashTable。我们用Map对象来替代那个hash数组吧：

var unique = function(object) {

	var map = new Map();

	var newArray = [];

	object.forEach(function(value) {

		if (!map.has(value)) {

			map.set(value, 1);

		}

	})

	for (var key of map.keys()) { // 别问我为什么不用return [...map.keys()]……

		newArray.push(key);

	}

	return newArray;

};

λ iojs .\map

Initializing test data...

Sorting test data...

Output 10:

int: 1 2 3 4 5 6 7 8 9 10

str: 1 2 3 4 5 6 7 8 9 10

mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000

Start test!

int: 67ms

10 items: 1 2 3 4 5 6 7 8 9 10

str: 59ms

10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"

mix2: 267ms

10 items: "1.000" 1 "1" "2.000" 2 "2" "3.000" 3 "3" "4.000"

All test: 482ms

λ iojs .\map

Initializing test data...

Sorting test data...

Output 10:

int: 1 10 10 10 10 10 10 10 10 10

str: 1 10 10 10 10 10 10 10 10 10

mix2: 1 1 1.000 10 10 10 10 10 10 10

Start test!

int: 80ms

10 items: 1 10 100 1000 10000 100000 10001 10002 10003 10004

str: 48ms

10 items: "1" "10" "100" "1000" "10000" "100000" "10001" "10002" "10003" "10004"

mix2: 209ms

10 items: 1 "1" "1.000" 10 "10" "10.000" "100" 100 "100.000" "1000"

All test: 921ms

左边未排序，右边已排序。

……等一下，我有病啊，为什么不用Set？

Set在ECMAScript规范中如是说：

Set objects are collections of ECMAScript language values. A distinct value may only occur once as an element of a Set’s collection. Distinct values are discriminated using the SameValueZero comparison algorithm.

于是……

var unique = function(object) {

	var newArray = [];

	var set = new Set(object);

	for (var key of set.keys()) { // 也别问我为什么不用[v for (v of xxx)]

		newArray.push(key);

	}

	return newArray;

};

结果：

λ iojs .\set

Initializing test data...

Sorting test data...

Output 10:

int: 1 2 3 4 5 6 7 8 9 10

str: 1 2 3 4 5 6 7 8 9 10

mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000

Start test!

int: 68ms

10 items: 1 2 3 4 5 6 7 8 9 10

str: 52ms

10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"

mix2: 185ms

10 items: "1.000" 1 "1" "2.000" 2 "2" "3.000" 3 "3" "4.000"

All test: 389ms

λ iojs .\set

Initializing test data...

Sorting test data...

Output 10:

int: 1 10 10 10 10 10 10 10 10 10

str: 1 10 10 10 10 10 10 10 10 10

mix2: 1 1 1.000 10 10 10 10 10 10 10

Start test!

int: 70ms

10 items: 1 10 100 1000 10000 100000 10001 10002 10003 10004

str: 51ms

10 items: "1" "10" "100" "1000" "10000" "100000" "10001" "10002" "10003" "10004"

mix2: 184ms

10 items: 1 "1" "1.000" 10 "10" "10.000" "100" 100 "100.000" "1000"

All test: 892ms

嗯……

于是，这篇文章到底想说什么呢……论科技进步对生产力的提高吗……