2013年11月28日木曜日

AC法をJavaで実装

文献[1]で紹介されているAC法をJavaで実装してみました.

AC法はテキスト内に出現する複数のキーワードを1回走査しただけで検出することができるアルゴリズムです. トライの部分はダブル配列法を使いたいところですが,コード量が多くなるので,今回はTreeMapで済ませました. テストケースの件数が少ないので,もしかするとバグっているかもしれません.

[1] 伊藤 直也, 田中 慎司: Web開発者のための大規模サービス技術入門, 技術評論社, 2010-08

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.TreeMap;
public class AC {
private Node rootNode;
public static AC create(String[] keywords) {
Arrays.sort(keywords, new Comparator<String>() {
@Override
public int compare(String keyword1, String keyword2) {
int result = keyword1.length() - keyword2.length();
if (result == 0) {
result = keyword1.compareTo(keyword2);
}
return result;
}
});
AC ac = new AC();
ac.rootNode = new Node();
ac.rootNode.failureNode = ac.rootNode;
for (String keyword : keywords) {
Node currentNode = ac.rootNode;
for (int i = 0; i < keyword.length(); i++) {
char label = keyword.charAt(i);
Node nextNode = ac.g(currentNode, label);
if (nextNode == null) {
nextNode = new Node();
nextNode.failureNode = ac.rootNode;
currentNode.nextNodeMap.put(label, nextNode);
}
currentNode = nextNode;
}
currentNode.lengths.add(keyword.length());
}
for (String keyword : keywords) {
Node keywordNode = ac.g(ac.rootNode, keyword);
for (int i = 1; i < keyword.length(); i++) {
String suffix = keyword.substring(i);
Node suffixNode = ac.g(ac.rootNode, suffix);
if (suffixNode != null) {
keywordNode.failureNode = suffixNode;
for (int length : suffixNode.lengths) {
keywordNode.lengths.add(length);
}
break;
}
}
}
return ac;
}
public Result match(String text) {
Result result = new Result();
Node currentNode = this.rootNode;
int position = 0;
while (position < text.length()) {
for(int length : currentNode.lengths) {
Occurrence occurrence = new Occurrence(position - length, length);
result.occurrences.add(occurrence);
}
char label = text.charAt(position);
Node nextNode = this.g(currentNode, label);
if (nextNode == null) {
while (currentNode != this.rootNode) {
currentNode = this.f(currentNode);
nextNode = this.g(currentNode, label);
if (nextNode != null) {
currentNode = nextNode;
break;
}
}
}
else {
currentNode = nextNode;
}
position++;
}
return result;
}
Node g(Node node, char label) {
Node nextNode = node.nextNodeMap.get(label);
return nextNode;
}
Node g(Node node, String labels) {
Node currentNode = node;
for (int i = 0; i < labels.length(); i++) {
char label = labels.charAt(i);
Node nextNode = this.g(currentNode, label);
if (nextNode == null) {
return null;
}
currentNode = nextNode;
}
return currentNode;
}
Node f(Node currentNode) {
return currentNode.failureNode;
}
static class Node {
TreeMap<Character, Node> nextNodeMap;
Node failureNode;
ArrayList<Integer> lengths;
Node() {
this.nextNodeMap = new TreeMap<Character, Node>();
this.failureNode = null;
this.lengths = new ArrayList<Integer>();
}
}
public static class Occurrence {
public int position;
public int length;
public Occurrence(int position, int length) {
this.position = position;
this.length = length;
}
@Override
public boolean equals(Object object) {
if (this == object) {
return true;
}
if (object == null) {
return false;
}
if (!(object instanceof Occurrence)) {
return false;
}
Occurrence other = (Occurrence)object;
if (this.position != other.position) {
return false;
}
if (this.length != other.length) {
return false;
}
return true;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + this.position;
result = prime * result + this.length;
return result;
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append('{');
builder.append("\"position:\"");
builder.append(this.position);
builder.append(",\"length:\"");
builder.append(this.length);
builder.append('}');
return builder.toString();
}
}
public static class Result {
public ArrayList<Occurrence> occurrences;
public Result() {
this.occurrences = new ArrayList<Occurrence>();
}
}
}
view raw AC.java hosted with ❤ by GitHub
import static org.hamcrest.core.IsEqual.equalTo;
import static org.junit.Assert.assertThat;
import java.util.ArrayList;
import java.util.List;
import org.junit.Test;
public class ACTest {
@Test
public void test() {
String[] keywords = new String[] {"he", "hers", "his", "she"};
String text = "a his hoge hershe xx.";
@SuppressWarnings("serial")
List<AC.Occurrence> occurrences = new ArrayList<AC.Occurrence>() {{
add(new AC.Occurrence( 2, 3));
add(new AC.Occurrence(11, 2));
add(new AC.Occurrence(11, 4));
add(new AC.Occurrence(14, 3));
add(new AC.Occurrence(15, 2));
}};
AC ac = AC.create(keywords);
AC.Result result = ac.match(text);
for (int i = 0; i < occurrences.size(); i++) {
AC.Occurrence expectedOccurrence = occurrences.get(i);
AC.Occurrence actualOccurrence = result.occurrences.get(i);
assertThat(expectedOccurrence, equalTo(actualOccurrence));
}
}
}
view raw ACTest.java hosted with ❤ by GitHub

2013年11月17日日曜日

VB CodeをJavaで実装

文献[1]で紹介されているVB CodeをJavaで実装してみました.

また,圧縮の効果も調べてみました. 以下がその結果(一例)です. 140万エントリのID列を想定してランダムに生成した整数列をギャップ列に直して,VC Codeでエンコードしています. 元の整数列が密になる(整数列の長さが増す)ほど,圧縮率が良くなっています. 密になればなるほど,整数間のギャップが小さくなるので,期待通りの結果になりました.

整数列長圧縮前(B)圧縮後(B)圧縮率
1441.00
1040320.80
1004002600.65
1000400019550.49
1000040000140790.35

[1] 伊藤 直也, 田中 慎司: Web開発者のための大規模サービス技術入門, 技術評論社, 2010-08

import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
import java.util.List;
public class VBCode {
public static byte[] encode(List<Integer> numbers) {
ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
byte[] bytes = new byte[5];
for (int number : numbers) {
int offset = 4;
while (true) {
bytes[offset] = (byte)(number % 128);
if (number < 128) {
break;
}
number = number / 128;
offset--;
}
bytes[4] = (byte)(bytes[4] | 0x80);
byteStream.write(bytes, offset, bytes.length - offset);
}
return byteStream.toByteArray();
}
public static List<Integer> decode(byte[] bytes) {
ArrayList<Integer> numbers = new ArrayList<Integer>();
int number = 0;
for (int i = 0; i < bytes.length; i++) {
if (bytes[i] >= 0) {
number = 128 * number + bytes[i];
}
else {
number = 128 * number + (bytes[i] & 0x7F);
numbers.add(number);
number = 0;
}
}
return numbers;
}
}
view raw VBCode.java hosted with ❤ by GitHub
import static org.hamcrest.CoreMatchers.equalTo;
import static org.junit.Assert.assertThat;
import java.util.ArrayList;
import java.util.List;
import org.junit.Test;
public class VBCodeTest {
private static final int MAXVALUE = 14000000;
@Test
public void test() {
int[] lengthes = {1, 10, 100, 1000, 10000, 100000};
System.out.println("Length\tNumbers Size\tEncoded Numbers Size\tCompression Rate");
for(int length : lengthes) {
List<Integer> numbers = createNumbers(1, MAXVALUE, length);
//System.out.println(toCSV(numbers));
List<Integer> gaps = createGaps(numbers);
//System.out.println(toCSV(gaps));
byte[] encodedGaps = VBCode.encode(gaps);
//System.out.println(toCSV(encodedGaps));
List<Integer> decodedGaps = VBCode.decode(encodedGaps);
//System.out.println(toCSV(decodedGaps));
assertThat(decodedGaps, equalTo(gaps));
int numbersSize = Integer.SIZE / Byte.SIZE * numbers.size();
System.out.print(length);
System.out.print("\t");
System.out.print(numbersSize);
System.out.print("\t");
System.out.print(encodedGaps.length);
System.out.print("\t");
System.out.print((double)encodedGaps.length / numbersSize);
System.out.println();
}
}
class Pair {
int lower;
int upper;
Pair(int lower, int upper) {
this.lower = lower;
this.upper = upper;
}
}
List<Integer> createNumbers(int lower, int upper, int length) {
ArrayList<Integer> numbers = new ArrayList<Integer>();
ArrayList<Pair> pairs = new ArrayList<Pair>();
pairs.add(new Pair(lower, upper));
LOOP:
while (true) {
ArrayList<Pair> nextPairs = new ArrayList<Pair>();
for (Pair pair : pairs) {
if (pair.lower == pair.upper) {
nextPairs.add(pair);
}
else {
int middle = pair.lower + (int)Math.floor(Math.random() * (pair.upper - pair.lower));
nextPairs.add(new Pair(pair.lower, middle));
if (nextPairs.size() == length) {
pairs = nextPairs;
break LOOP;
}
nextPairs.add(new Pair(middle + 1, pair.upper));
if (nextPairs.size() == length) {
pairs = nextPairs;
break LOOP;
}
}
}
pairs = nextPairs;
}
for (Pair pair : pairs) {
int number = pair.lower + (int)Math.floor(Math.random() * (pair.upper - pair.lower));
numbers.add(number);
}
return numbers;
}
List<Integer> createGaps(List<Integer> numbers) {
List<Integer> gaps = new ArrayList<Integer>(numbers.size());
gaps.add(numbers.get(0));
for(int i = 1; i < numbers.size(); i++) {
gaps.add(numbers.get(i) - numbers.get(i - 1));
}
return gaps;
}
static int asUint(byte b) {
return b & 0xFF;
}
static String toCSV(int[] array) {
StringBuilder builder = new StringBuilder();
if (array.length > 0) {
builder.append(array[0]);
}
for (int i = 1; i < array.length; i++) {
builder.append(",");
builder.append(array[i]);
}
return builder.toString();
}
static String toCSV(byte[] array) {
StringBuilder builder = new StringBuilder();
if (array.length > 0) {
builder.append(asUint(array[0]));
}
for (int i = 1; i < array.length; i++) {
builder.append(",");
builder.append(asUint(array[i]));
}
return builder.toString();
}
}
view raw VBCodeTest.java hosted with ❤ by GitHub