2013年11月28日木曜日

AC法をJavaで実装

文献[1]で紹介されているAC法をJavaで実装してみました.

AC法はテキスト内に出現する複数のキーワードを1回走査しただけで検出することができるアルゴリズムです. トライの部分はダブル配列法を使いたいところですが,コード量が多くなるので,今回はTreeMapで済ませました. テストケースの件数が少ないので,もしかするとバグっているかもしれません.

[1] 伊藤 直也, 田中 慎司: Web開発者のための大規模サービス技術入門, 技術評論社, 2010-08

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.TreeMap;
public class AC {
private Node rootNode;
public static AC create(String[] keywords) {
Arrays.sort(keywords, new Comparator<String>() {
@Override
public int compare(String keyword1, String keyword2) {
int result = keyword1.length() - keyword2.length();
if (result == 0) {
result = keyword1.compareTo(keyword2);
}
return result;
}
});
AC ac = new AC();
ac.rootNode = new Node();
ac.rootNode.failureNode = ac.rootNode;
for (String keyword : keywords) {
Node currentNode = ac.rootNode;
for (int i = 0; i < keyword.length(); i++) {
char label = keyword.charAt(i);
Node nextNode = ac.g(currentNode, label);
if (nextNode == null) {
nextNode = new Node();
nextNode.failureNode = ac.rootNode;
currentNode.nextNodeMap.put(label, nextNode);
}
currentNode = nextNode;
}
currentNode.lengths.add(keyword.length());
}
for (String keyword : keywords) {
Node keywordNode = ac.g(ac.rootNode, keyword);
for (int i = 1; i < keyword.length(); i++) {
String suffix = keyword.substring(i);
Node suffixNode = ac.g(ac.rootNode, suffix);
if (suffixNode != null) {
keywordNode.failureNode = suffixNode;
for (int length : suffixNode.lengths) {
keywordNode.lengths.add(length);
}
break;
}
}
}
return ac;
}
public Result match(String text) {
Result result = new Result();
Node currentNode = this.rootNode;
int position = 0;
while (position < text.length()) {
for(int length : currentNode.lengths) {
Occurrence occurrence = new Occurrence(position - length, length);
result.occurrences.add(occurrence);
}
char label = text.charAt(position);
Node nextNode = this.g(currentNode, label);
if (nextNode == null) {
while (currentNode != this.rootNode) {
currentNode = this.f(currentNode);
nextNode = this.g(currentNode, label);
if (nextNode != null) {
currentNode = nextNode;
break;
}
}
}
else {
currentNode = nextNode;
}
position++;
}
return result;
}
Node g(Node node, char label) {
Node nextNode = node.nextNodeMap.get(label);
return nextNode;
}
Node g(Node node, String labels) {
Node currentNode = node;
for (int i = 0; i < labels.length(); i++) {
char label = labels.charAt(i);
Node nextNode = this.g(currentNode, label);
if (nextNode == null) {
return null;
}
currentNode = nextNode;
}
return currentNode;
}
Node f(Node currentNode) {
return currentNode.failureNode;
}
static class Node {
TreeMap<Character, Node> nextNodeMap;
Node failureNode;
ArrayList<Integer> lengths;
Node() {
this.nextNodeMap = new TreeMap<Character, Node>();
this.failureNode = null;
this.lengths = new ArrayList<Integer>();
}
}
public static class Occurrence {
public int position;
public int length;
public Occurrence(int position, int length) {
this.position = position;
this.length = length;
}
@Override
public boolean equals(Object object) {
if (this == object) {
return true;
}
if (object == null) {
return false;
}
if (!(object instanceof Occurrence)) {
return false;
}
Occurrence other = (Occurrence)object;
if (this.position != other.position) {
return false;
}
if (this.length != other.length) {
return false;
}
return true;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + this.position;
result = prime * result + this.length;
return result;
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append('{');
builder.append("\"position:\"");
builder.append(this.position);
builder.append(",\"length:\"");
builder.append(this.length);
builder.append('}');
return builder.toString();
}
}
public static class Result {
public ArrayList<Occurrence> occurrences;
public Result() {
this.occurrences = new ArrayList<Occurrence>();
}
}
}
view raw AC.java hosted with ❤ by GitHub
import static org.hamcrest.core.IsEqual.equalTo;
import static org.junit.Assert.assertThat;
import java.util.ArrayList;
import java.util.List;
import org.junit.Test;
public class ACTest {
@Test
public void test() {
String[] keywords = new String[] {"he", "hers", "his", "she"};
String text = "a his hoge hershe xx.";
@SuppressWarnings("serial")
List<AC.Occurrence> occurrences = new ArrayList<AC.Occurrence>() {{
add(new AC.Occurrence( 2, 3));
add(new AC.Occurrence(11, 2));
add(new AC.Occurrence(11, 4));
add(new AC.Occurrence(14, 3));
add(new AC.Occurrence(15, 2));
}};
AC ac = AC.create(keywords);
AC.Result result = ac.match(text);
for (int i = 0; i < occurrences.size(); i++) {
AC.Occurrence expectedOccurrence = occurrences.get(i);
AC.Occurrence actualOccurrence = result.occurrences.get(i);
assertThat(expectedOccurrence, equalTo(actualOccurrence));
}
}
}
view raw ACTest.java hosted with ❤ by GitHub

0 件のコメント: