/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ /* * Copyright (C) 2008 Jeffrey Stedfast * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, * modify, merge, publish, distribute, sublicense, and/or sell copies * of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include "trie.h" #define d(x) x static void trie_match_free (TrieMatch *match); static void trie_state_free (TrieState *state); static void trie_match_free (TrieMatch *match) { TrieMatch *next; while (match) { next = match->next; trie_state_free (match->state); delete match; match = next; } } static void trie_state_free (TrieState *state) { trie_match_free (state->match); delete state; } Trie::Trie (bool icase) { this->icase = icase; root.next = 0; root.fail = 0; root.match = 0; root.final = 0; } Trie::~Trie () { trie_match_free (root.match); } static TrieMatch * g (TrieState *s, char c) { TrieMatch *m = s->match; while (m && m->c != c) m = m->next; return m; } TrieState * Trie::Insert (size_t depth, TrieState *q, char c) { TrieMatch *m; size_t size; m = new TrieMatch (); m->next = q->match; m->c = c; q->match = m; q = m->state = new TrieState (); q->match = 0; q->fail = &root; q->final = 0; q->id = 0; size = fail_states.size (); if (depth < size) { q->next = fail_states[depth]; fail_states[depth] = q; } else { fail_states.insert (fail_states.end (), q); q->next = 0; } return q; } #if d(!)0 static void dump_trie (TrieState *s, size_t depth) { char *p = (char *) alloca ((depth * 2) + 1); TrieMatch *m; memset (p, ' ', depth * 2); p[depth * 2] = '\0'; fprintf (stderr, "%s[state] %p: final=%d; pattern-id=%d; fail=%p\n", p, s, s->final, s->id, s->fail); m = s->match; while (m) { fprintf (stderr, " %s'%c' -> %p\n", p, m->c, m->state); if (m->state) dump_trie (m->state, depth + 1); m = m->next; } } #endif /* * final = empty set * FOR p = 1 TO #pat * q = root * FOR j = 1 TO m[p] * IF g(q, pat[p][j]) == null * insert(q, pat[p][j]) * ENDIF * q = g(q, pat[p][j]) * ENDFOR * final = union(final, q) * ENDFOR */ void Trie::AddPattern (const char *pattern, int pattern_id) { const char *inptr = (const char *) pattern; TrieState *q, *q1, *r; TrieMatch *m, *n; size_t depth = 0; char c; /* Step 1: add the pattern to the trie */ q = &root; while ((c = *inptr++)) { if (icase && (c >= 'A' && c <= 'Z')) c += 0x20; m = g (q, c); if (m == 0) { q = Insert (depth, q, c); } else { q = m->state; } depth++; } q->final = depth; q->id = pattern_id; /* Step 2: compute failure graph */ for (size_t i = 0; i < fail_states.size (); i++) { q = fail_states[i]; while (q) { m = q->match; while (m) { c = m->c; q1 = m->state; r = q->fail; while (r && (n = g (r, c)) == 0) r = r->fail; if (r != 0) { q1->fail = n->state; if (q1->fail->final > q1->final) q1->final = q1->fail->final; } else { if ((n = g (&root, c))) q1->fail = n->state; else q1->fail = &root; } m = m->next; } q = q->next; } } d(fprintf (stderr, "\nafter adding pattern '%s' to trie %p:\n", pattern, this)); d(dump_trie (&root, 0)); } /* * Aho-Corasick * * q = root * FOR i = 1 TO n * WHILE q != fail AND g(q, text[i]) == fail * q = h(q) * ENDWHILE * IF q == fail * q = root * ELSE * q = g(q, text[i]) * ENDIF * IF isElement(q, final) * RETURN TRUE * ENDIF * ENDFOR * RETURN FALSE */ const char * Trie::Search (const char *buf, size_t buflen, int *matched_id) { const char *inptr, *inend, *prev, *pat; size_t matched = 0; TrieMatch *m = 0; TrieState *q; char c; inend = buf + buflen; inptr = buf; q = &root; pat = prev = inptr; while (inptr < inend) { c = *inptr++; if (icase && (c >= 'A' && c <= 'Z')) c += 0x20; while (q != 0 && (m = g (q, c)) == 0 && matched == 0) q = q->fail; if (q == &root) { if (matched) return pat; pat = prev; } if (q == 0) { if (matched) return pat; q = &root; pat = inptr; } else if (m != 0) { q = m->state; if (q->final > matched) { if (matched_id) *matched_id = q->id; matched = q->final; } } prev = inptr; } return matched ? pat : 0; } #ifdef TEST static const char *patterns[] = { "news://", "nntp://", "telnet://", "file://", "ftp://", "http://", "https://", "www.", "ftp.", "mailto:", "file:", "@", "abcabcabc", "abcdefg", "abcabc", "abc" }; static const char *haystacks[] = { "try this url: http://www.ximian.com", "or, feel free to email me at fejj@ximian.com", "don't forget to check out www.ximian.com", "I've attached a file (file:///cvs/gmime/gmime/gtrie.c)", "this should match the first file-colon-slash-slash: file://file://", "this should match file-colon: file:/file://", "abcabcabcdefghijklmnopqrstuvwxyz" }; #define G_N_ELEMENTS(array) (sizeof (array) / sizeof (array[0])) int main (int argc, char **argv) { const char *match; Trie *trie; int id, i; trie = new Trie (true); for (i = 0; i < (int) G_N_ELEMENTS (patterns); i++) trie->AddPattern (patterns[i], i); for (i = 0; i < (int) G_N_ELEMENTS (haystacks); i++) { if ((match = trie->Search (haystacks[i], strlen (haystacks[i]), &id))) { fprintf (stderr, "matched @ '%s' with pattern '%s'\n", match, patterns[id]); } else { fprintf (stderr, "no match\n"); } } fflush (stdout); delete trie; return match != 0; } #endif /* TEST */