去重这个需求古已有之。然而,由于JavaScript语言的特性,导致去重这种需求既好做,又不好做。现在是ECMAScript 6时代了,还用着以前的方式,感觉也是不太对劲呢。另外,深入深入这玩意,没准还能有些别样的收获。
测试机:i3-3220 / 16G / Windows 10 Build 10166 / iojs 2.2.1 / nodejs 0.12.0
先来一段鬼畜的初始化代码:
var aryInt = [];
var aryStr = [];
var aryMix = [];
function insertToArray(i) {
aryInt.push(i);
aryStr.push(i.toString());
aryMix.push(i.toFixed(3), i, i.toString());
}
console.time("All test");
console.log("Initializing test data...");
for (var i = 1; i <= 100000; i++) {
insertToArray(i); // 普通数据
}
for (i = 2; i <= 100000; i = i * 1.01) { // 越后期越稀疏
insertToArray(parseInt(i));
}/*
console.log("Sorting test data...");
aryInt.sort();
aryStr.sort();
aryMix.sort();
*/
module.exports = {
int: aryInt,
str: aryStr,
mix2: aryMix
};
console.log("Output 10 data:");
var testString;
for (var key in module.exports) {
testString = [];
for (i = 0; i < 10; i++) {
testString.push(module.exports[key][i]);
}
console.log(key + ": " + testString.join(" "));
}
console.log("Start test!");
没错,这就是很鬼畜的数据。将其保存为array.js
然后,我们的每个文件都要加上如下代码,配上名字为unique的函数,来实现我们的自动化测试。
var array = require('./array');
var i, testString = [], newArray = [];
for (var key in array) {
console.time(key);
newArray = unique(array[key]);
console.timeEnd(key);
for (i = 0; i < 10; i++) {
testString.push((typeof newArray[i] == "string" ? '"' + newArray[i] + '"': newArray[i]));
}
console.log("10 items: " + testString.join(" "));
testString = [];
}
console.timeEnd("All test");
我们知道,(看起来)最省空间的方式也就是直接从数组内搜索,如直接双层循环,或是用indexOf.
借用玉伯大大的两个算法(https://github.com/lifesinger/lifesinger.github.com/issues/113),分别是indexOf和IE6 - 8下的indexOf,命名为indexOf.js和doubleFor.js(因为IE6下的就是两个for循环实现的)。理论时间复杂度为O(n2)。
因为耗时太长,所以我停止了mix型数据的去重。测试结果如下:
λ iojs .\indexOf
Initializing test data...
Sorting test data...
Output 10:
int: 1 2 3 4 5 6 7 8 9 10
str: 1 2 3 4 5 6 7 8 9 10
mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000
Start test!
int: 5064ms
10 items: 1 2 3 4 5 6 7 8 9 10
str: 40610ms
10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
| λ iojs .\doubleFor
Initializing test data...
Sorting test data...
Output 10:
int: 1 2 3 4 5 6 7 8 9 10
str: 1 2 3 4 5 6 7 8 9 10
mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000
Start test!
int: 5051ms
10 items: 1 2 3 4 5 6 7 8 9 10
str: 45762ms
10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
|
λ iojs .\indexOf
Initializing test data...
Sorting test data...
Output 10:
int: 1 2 3 4 5 6 7 8 9 10
str: 1 2 3 4 5 6 7 8 9 10
mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000
Start test!
int: 4978ms
10 items: 1 2 3 4 5 6 7 8 9 10
str: 49288ms
10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
| λ iojs .\doubleFor
Initializing test data...
Sorting test data...
Output 10:
int: 1 2 3 4 5 6 7 8 9 10
str: 1 2 3 4 5 6 7 8 9 10
mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000
Start test!
int: 4915ms
10 items: 1 2 3 4 5 6 7 8 9 10
str: 43357ms
10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
|
别问我为什么indexOf和doubleFor速度差不多甚至后者可能更快,我心累……
关于indexOf,ECMAScript有标准的实现规范,不另加摘录: http://www.ecma-international.org/ecma-262/6.0/#sec-array.prototype.indexof
看起来很鬼畜?实际上这坨东西的意思也只是把这个数组遍历搜索一遍,时间复杂度为O(n)。
把排序打开,把范围调小;我们可以看到,这两个的输出结果还是能完美区分数字和字符串类型。
但是这个差距是怎么回事呢?直接看看v8的代码吧:https://github.com/v8/v8/blob/master/src/array.js#L1304。 ArrayIndexOf函数的确完整实现了ECMAScript,然而其的各种限制太多,又是JavaScript所写,当然比不过我们自己实现的indexOf.
λ iojs .\indexOf
Initializing test data...
Sorting test data...
Output 10:
int: 1 10 10 10 10 10 10 10 10 10
str: 1 10 10 10 10 10 10 10 10 10
mix2: 1 1 1.000 10 10 10 10 10 10 10
Start test!
int: 4ms
10 items: 1 10 100 1000 101 102 103 104 105 106
str: 9ms
10 items: "1" "10" "100" "1000" "101" "102" "103" "104" "105" "106"
mix2: 67ms
10 items: "1" 1 "1.000" 10 "10" "10.000" 100 "100" "100.000" 1000
All test: 98ms
| λ iojs .\doubleFor
Initializing test data...
Sorting test data...
Output 10:
int: 1 10 10 10 10 10 10 10 10 10
str: 1 10 10 10 10 10 10 10 10 10
mix2: 1 1 1.000 10 10 10 10 10 10 10
Start test!
int: 2ms
10 items: 1 10 100 1000 101 102 103 104 105 106
str: 6ms
10 items: "1" "10" "100" "1000" "101" "102" "103" "104" "105" "106"
mix2: 32ms
10 items: "1" 1 "1.000" 10 "10" "10.000" 100 "100" "100.000" 1000
All test: 57ms
|
得出结论1:V8下,双层For有相当大的可能比indexOf操作符来得快。
然后,看玉伯的第三个方案,也就是Hash。我们可以再弄一个不带typeof的版本出来测试。另外,第二行是经过排序后的版本。
λ iojs .\hashWithTypeof
Initializing test data...
Output 10:
int: 1 2 3 4 5 6 7 8 9 10
str: 1 2 3 4 5 6 7 8 9 10
mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000
Start test!
int: 123ms
10 items: 1 2 3 4 5 6 7 8 9 10
str: 163ms
10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
mix2: 392ms
10 items: "1.000" 1 "1" "2.000" 2 "2" "3.000" 3 "3" "4.000"
All test: 768ms
| λ iojs .\hashWithoutTypeof
Initializing test data...
Output 10:
int: 1 2 3 4 5 6 7 8 9 10
str: 1 2 3 4 5 6 7 8 9 10
mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000
Start test!
int: 26ms
10 items: 1 2 3 4 5 6 7 8 9 10
str: 41ms
10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
mix2: 135ms
10 items: "1.000" 1 "2.000" 2 "3.000" 3 "4.000" 4 "5.000" 5
All test: 289ms
|
λ iojs .\hashWithTypeof
Initializing test data...
Sorting test data...
Output 10:
int: 1 10 10 10 10 10 10 10 10 10
str: 1 10 10 10 10 10 10 10 10 10
mix2: 1 1 1.000 10 10 10 10 10 10 10
Start test!
int: 145ms
10 items: 1 10 100 1000 10000 100000 10001 10002 10003 10004
str: 138ms
10 items: "1" "10" "100" "1000" "10000" "100000" "10001" "10002" "10003" "10004"
mix2: 440ms
10 items: 1 "1" "1.000" 10 "10" "10.000" "100" 100 "100.000" "1000"
All test: 1323ms
| λ iojs .\hashWithoutTypeof
Initializing test data...
Sorting test data...
Output 10:
int: 1 10 10 10 10 10 10 10 10 10
str: 1 10 10 10 10 10 10 10 10 10
mix2: 1 1 1.000 10 10 10 10 10 10 10
Start test!
int: 37ms
10 items: 1 10 100 1000 10000 100000 10001 10002 10003 10004
str: 62ms
10 items: "1" "10" "100" "1000" "10000" "100000" "10001" "10002" "10003" "10004"
mix2: 168ms
10 items: 1 "1.000" 10 "10.000" "100" "100.000" "1000" "1000.000" 10000 "10000.000"
All test: 888ms
|
然而,typeof的性能如此低下,让人简直不敢直接用。更何况,其甚至还有对于new String这样的Object返回的仍然是Object的问题……然而,其实还有更慢的方法:把typeof换成
Object.prototype.toString.call(item)
这样的话,性能测试比左侧typeof得分还低……
得出结论2:V8下,尽量少判断某个元素的类型。毕竟是动态语言……
推论:如果对类型要求比较强,还是用TypeScript会更为方便……
再往下深究吧。
V8里,一个数组使用的对象是Map(见Genesis::InstallInternalArray),不过内部实现是什么数据结构就不知道了=_= ,Map对象使用的(似乎)是HashTable。我们用Map对象来替代那个hash数组吧:
var unique = function(object) {
var map = new Map();
var newArray = [];
object.forEach(function(value) {
if (!map.has(value)) {
map.set(value, 1);
}
})
for (var key of map.keys()) { // 别问我为什么不用return [...map.keys()]……
newArray.push(key);
}
return newArray;
};
λ iojs .\map
Initializing test data...
Sorting test data...
Output 10:
int: 1 2 3 4 5 6 7 8 9 10
str: 1 2 3 4 5 6 7 8 9 10
mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000
Start test!
int: 67ms
10 items: 1 2 3 4 5 6 7 8 9 10
str: 59ms
10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
mix2: 267ms
10 items: "1.000" 1 "1" "2.000" 2 "2" "3.000" 3 "3" "4.000"
All test: 482ms
| λ iojs .\map
Initializing test data...
Sorting test data...
Output 10:
int: 1 10 10 10 10 10 10 10 10 10
str: 1 10 10 10 10 10 10 10 10 10
mix2: 1 1 1.000 10 10 10 10 10 10 10
Start test!
int: 80ms
10 items: 1 10 100 1000 10000 100000 10001 10002 10003 10004
str: 48ms
10 items: "1" "10" "100" "1000" "10000" "100000" "10001" "10002" "10003" "10004"
mix2: 209ms
10 items: 1 "1" "1.000" 10 "10" "10.000" "100" 100 "100.000" "1000"
All test: 921ms
|
左边未排序,右边已排序。
……等一下,我有病啊,为什么不用Set?
Set在ECMAScript规范中如是说:
Set objects are collections of ECMAScript language values. A distinct value may only occur once as an element of a Set’s collection. Distinct values are discriminated using the SameValueZero comparison algorithm.
于是……
var unique = function(object) {
var newArray = [];
var set = new Set(object);
for (var key of set.keys()) { // 也别问我为什么不用[v for (v of xxx)]
newArray.push(key);
}
return newArray;
};
结果:
λ iojs .\set
Initializing test data...
Sorting test data...
Output 10:
int: 1 2 3 4 5 6 7 8 9 10
str: 1 2 3 4 5 6 7 8 9 10
mix2: 1.000 1 1 2.000 2 2 3.000 3 3 4.000
Start test!
int: 68ms
10 items: 1 2 3 4 5 6 7 8 9 10
str: 52ms
10 items: "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
mix2: 185ms
10 items: "1.000" 1 "1" "2.000" 2 "2" "3.000" 3 "3" "4.000"
All test: 389ms
| λ iojs .\set
Initializing test data...
Sorting test data...
Output 10:
int: 1 10 10 10 10 10 10 10 10 10
str: 1 10 10 10 10 10 10 10 10 10
mix2: 1 1 1.000 10 10 10 10 10 10 10
Start test!
int: 70ms
10 items: 1 10 100 1000 10000 100000 10001 10002 10003 10004
str: 51ms
10 items: "1" "10" "100" "1000" "10000" "100000" "10001" "10002" "10003" "10004"
mix2: 184ms
10 items: 1 "1" "1.000" 10 "10" "10.000" "100" 100 "100.000" "1000"
All test: 892ms
|
嗯……
于是,这篇文章到底想说什么呢……论科技进步对生产力的提高吗……