文献[1]で紹介されているAC法をJavaで実装してみました.
AC法はテキスト内に出現する複数のキーワードを1回走査しただけで検出することができるアルゴリズムです. トライの部分はダブル配列法を使いたいところですが,コード量が多くなるので,今回はTreeMapで済ませました. テストケースの件数が少ないので,もしかするとバグっているかもしれません.
[1] | 伊藤 直也, 田中 慎司: Web開発者のための大規模サービス技術入門, 技術評論社, 2010-08 |
![]() ![]() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.Comparator; | |
import java.util.TreeMap; | |
public class AC { | |
private Node rootNode; | |
public static AC create(String[] keywords) { | |
Arrays.sort(keywords, new Comparator<String>() { | |
@Override | |
public int compare(String keyword1, String keyword2) { | |
int result = keyword1.length() - keyword2.length(); | |
if (result == 0) { | |
result = keyword1.compareTo(keyword2); | |
} | |
return result; | |
} | |
}); | |
AC ac = new AC(); | |
ac.rootNode = new Node(); | |
ac.rootNode.failureNode = ac.rootNode; | |
for (String keyword : keywords) { | |
Node currentNode = ac.rootNode; | |
for (int i = 0; i < keyword.length(); i++) { | |
char label = keyword.charAt(i); | |
Node nextNode = ac.g(currentNode, label); | |
if (nextNode == null) { | |
nextNode = new Node(); | |
nextNode.failureNode = ac.rootNode; | |
currentNode.nextNodeMap.put(label, nextNode); | |
} | |
currentNode = nextNode; | |
} | |
currentNode.lengths.add(keyword.length()); | |
} | |
for (String keyword : keywords) { | |
Node keywordNode = ac.g(ac.rootNode, keyword); | |
for (int i = 1; i < keyword.length(); i++) { | |
String suffix = keyword.substring(i); | |
Node suffixNode = ac.g(ac.rootNode, suffix); | |
if (suffixNode != null) { | |
keywordNode.failureNode = suffixNode; | |
for (int length : suffixNode.lengths) { | |
keywordNode.lengths.add(length); | |
} | |
break; | |
} | |
} | |
} | |
return ac; | |
} | |
public Result match(String text) { | |
Result result = new Result(); | |
Node currentNode = this.rootNode; | |
int position = 0; | |
while (position < text.length()) { | |
for(int length : currentNode.lengths) { | |
Occurrence occurrence = new Occurrence(position - length, length); | |
result.occurrences.add(occurrence); | |
} | |
char label = text.charAt(position); | |
Node nextNode = this.g(currentNode, label); | |
if (nextNode == null) { | |
while (currentNode != this.rootNode) { | |
currentNode = this.f(currentNode); | |
nextNode = this.g(currentNode, label); | |
if (nextNode != null) { | |
currentNode = nextNode; | |
break; | |
} | |
} | |
} | |
else { | |
currentNode = nextNode; | |
} | |
position++; | |
} | |
return result; | |
} | |
Node g(Node node, char label) { | |
Node nextNode = node.nextNodeMap.get(label); | |
return nextNode; | |
} | |
Node g(Node node, String labels) { | |
Node currentNode = node; | |
for (int i = 0; i < labels.length(); i++) { | |
char label = labels.charAt(i); | |
Node nextNode = this.g(currentNode, label); | |
if (nextNode == null) { | |
return null; | |
} | |
currentNode = nextNode; | |
} | |
return currentNode; | |
} | |
Node f(Node currentNode) { | |
return currentNode.failureNode; | |
} | |
static class Node { | |
TreeMap<Character, Node> nextNodeMap; | |
Node failureNode; | |
ArrayList<Integer> lengths; | |
Node() { | |
this.nextNodeMap = new TreeMap<Character, Node>(); | |
this.failureNode = null; | |
this.lengths = new ArrayList<Integer>(); | |
} | |
} | |
public static class Occurrence { | |
public int position; | |
public int length; | |
public Occurrence(int position, int length) { | |
this.position = position; | |
this.length = length; | |
} | |
@Override | |
public boolean equals(Object object) { | |
if (this == object) { | |
return true; | |
} | |
if (object == null) { | |
return false; | |
} | |
if (!(object instanceof Occurrence)) { | |
return false; | |
} | |
Occurrence other = (Occurrence)object; | |
if (this.position != other.position) { | |
return false; | |
} | |
if (this.length != other.length) { | |
return false; | |
} | |
return true; | |
} | |
@Override | |
public int hashCode() { | |
final int prime = 31; | |
int result = 1; | |
result = prime * result + this.position; | |
result = prime * result + this.length; | |
return result; | |
} | |
@Override | |
public String toString() { | |
StringBuilder builder = new StringBuilder(); | |
builder.append('{'); | |
builder.append("\"position:\""); | |
builder.append(this.position); | |
builder.append(",\"length:\""); | |
builder.append(this.length); | |
builder.append('}'); | |
return builder.toString(); | |
} | |
} | |
public static class Result { | |
public ArrayList<Occurrence> occurrences; | |
public Result() { | |
this.occurrences = new ArrayList<Occurrence>(); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import static org.hamcrest.core.IsEqual.equalTo; | |
import static org.junit.Assert.assertThat; | |
import java.util.ArrayList; | |
import java.util.List; | |
import org.junit.Test; | |
public class ACTest { | |
@Test | |
public void test() { | |
String[] keywords = new String[] {"he", "hers", "his", "she"}; | |
String text = "a his hoge hershe xx."; | |
@SuppressWarnings("serial") | |
List<AC.Occurrence> occurrences = new ArrayList<AC.Occurrence>() {{ | |
add(new AC.Occurrence( 2, 3)); | |
add(new AC.Occurrence(11, 2)); | |
add(new AC.Occurrence(11, 4)); | |
add(new AC.Occurrence(14, 3)); | |
add(new AC.Occurrence(15, 2)); | |
}}; | |
AC ac = AC.create(keywords); | |
AC.Result result = ac.match(text); | |
for (int i = 0; i < occurrences.size(); i++) { | |
AC.Occurrence expectedOccurrence = occurrences.get(i); | |
AC.Occurrence actualOccurrence = result.occurrences.get(i); | |
assertThat(expectedOccurrence, equalTo(actualOccurrence)); | |
} | |
} | |
} |