文献[1]で紹介されているAC法をJavaで実装してみました.
AC法はテキスト内に出現する複数のキーワードを1回走査しただけで検出することができるアルゴリズムです. トライの部分はダブル配列法を使いたいところですが,コード量が多くなるので,今回はTreeMapで済ませました. テストケースの件数が少ないので,もしかするとバグっているかもしれません.
[1] | 伊藤 直也, 田中 慎司: Web開発者のための大規模サービス技術入門, 技術評論社, 2010-08 |
![]() ![]() |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.Comparator; | |
import java.util.TreeMap; | |
public class AC { | |
private Node rootNode; | |
public static AC create(String[] keywords) { | |
Arrays.sort(keywords, new Comparator<String>() { | |
@Override | |
public int compare(String keyword1, String keyword2) { | |
int result = keyword1.length() - keyword2.length(); | |
if (result == 0) { | |
result = keyword1.compareTo(keyword2); | |
} | |
return result; | |
} | |
}); | |
AC ac = new AC(); | |
ac.rootNode = new Node(); | |
ac.rootNode.failureNode = ac.rootNode; | |
for (String keyword : keywords) { | |
Node currentNode = ac.rootNode; | |
for (int i = 0; i < keyword.length(); i++) { | |
char label = keyword.charAt(i); | |
Node nextNode = ac.g(currentNode, label); | |
if (nextNode == null) { | |
nextNode = new Node(); | |
nextNode.failureNode = ac.rootNode; | |
currentNode.nextNodeMap.put(label, nextNode); | |
} | |
currentNode = nextNode; | |
} | |
currentNode.lengths.add(keyword.length()); | |
} | |
for (String keyword : keywords) { | |
Node keywordNode = ac.g(ac.rootNode, keyword); | |
for (int i = 1; i < keyword.length(); i++) { | |
String suffix = keyword.substring(i); | |
Node suffixNode = ac.g(ac.rootNode, suffix); | |
if (suffixNode != null) { | |
keywordNode.failureNode = suffixNode; | |
for (int length : suffixNode.lengths) { | |
keywordNode.lengths.add(length); | |
} | |
break; | |
} | |
} | |
} | |
return ac; | |
} | |
public Result match(String text) { | |
Result result = new Result(); | |
Node currentNode = this.rootNode; | |
int position = 0; | |
while (position < text.length()) { | |
for(int length : currentNode.lengths) { | |
Occurrence occurrence = new Occurrence(position - length, length); | |
result.occurrences.add(occurrence); | |
} | |
char label = text.charAt(position); | |
Node nextNode = this.g(currentNode, label); | |
if (nextNode == null) { | |
while (currentNode != this.rootNode) { | |
currentNode = this.f(currentNode); | |
nextNode = this.g(currentNode, label); | |
if (nextNode != null) { | |
currentNode = nextNode; | |
break; | |
} | |
} | |
} | |
else { | |
currentNode = nextNode; | |
} | |
position++; | |
} | |
return result; | |
} | |
Node g(Node node, char label) { | |
Node nextNode = node.nextNodeMap.get(label); | |
return nextNode; | |
} | |
Node g(Node node, String labels) { | |
Node currentNode = node; | |
for (int i = 0; i < labels.length(); i++) { | |
char label = labels.charAt(i); | |
Node nextNode = this.g(currentNode, label); | |
if (nextNode == null) { | |
return null; | |
} | |
currentNode = nextNode; | |
} | |
return currentNode; | |
} | |
Node f(Node currentNode) { | |
return currentNode.failureNode; | |
} | |
static class Node { | |
TreeMap<Character, Node> nextNodeMap; | |
Node failureNode; | |
ArrayList<Integer> lengths; | |
Node() { | |
this.nextNodeMap = new TreeMap<Character, Node>(); | |
this.failureNode = null; | |
this.lengths = new ArrayList<Integer>(); | |
} | |
} | |
public static class Occurrence { | |
public int position; | |
public int length; | |
public Occurrence(int position, int length) { | |
this.position = position; | |
this.length = length; | |
} | |
@Override | |
public boolean equals(Object object) { | |
if (this == object) { | |
return true; | |
} | |
if (object == null) { | |
return false; | |
} | |
if (!(object instanceof Occurrence)) { | |
return false; | |
} | |
Occurrence other = (Occurrence)object; | |
if (this.position != other.position) { | |
return false; | |
} | |
if (this.length != other.length) { | |
return false; | |
} | |
return true; | |
} | |
@Override | |
public int hashCode() { | |
final int prime = 31; | |
int result = 1; | |
result = prime * result + this.position; | |
result = prime * result + this.length; | |
return result; | |
} | |
@Override | |
public String toString() { | |
StringBuilder builder = new StringBuilder(); | |
builder.append('{'); | |
builder.append("\"position:\""); | |
builder.append(this.position); | |
builder.append(",\"length:\""); | |
builder.append(this.length); | |
builder.append('}'); | |
return builder.toString(); | |
} | |
} | |
public static class Result { | |
public ArrayList<Occurrence> occurrences; | |
public Result() { | |
this.occurrences = new ArrayList<Occurrence>(); | |
} | |
} | |
} |
import static org.hamcrest.core.IsEqual.equalTo; | |
import static org.junit.Assert.assertThat; | |
import java.util.ArrayList; | |
import java.util.List; | |
import org.junit.Test; | |
public class ACTest { | |
@Test | |
public void test() { | |
String[] keywords = new String[] {"he", "hers", "his", "she"}; | |
String text = "a his hoge hershe xx."; | |
@SuppressWarnings("serial") | |
List<AC.Occurrence> occurrences = new ArrayList<AC.Occurrence>() {{ | |
add(new AC.Occurrence( 2, 3)); | |
add(new AC.Occurrence(11, 2)); | |
add(new AC.Occurrence(11, 4)); | |
add(new AC.Occurrence(14, 3)); | |
add(new AC.Occurrence(15, 2)); | |
}}; | |
AC ac = AC.create(keywords); | |
AC.Result result = ac.match(text); | |
for (int i = 0; i < occurrences.size(); i++) { | |
AC.Occurrence expectedOccurrence = occurrences.get(i); | |
AC.Occurrence actualOccurrence = result.occurrences.get(i); | |
assertThat(expectedOccurrence, equalTo(actualOccurrence)); | |
} | |
} | |
} |