"""Mostly translated from: https://code.woboq.org/userspace/glibc/string/str-two-way.h.html """ def _lex_search(needle, inverted): # We'll eventually partition needle into # needle[:max_suffix + 1], needle[max_suffix + 1:] max_suffix = -1 suffix = 0 # candidate for max_suffix period = 1 # candidate for period k = 1 # index while suffix + k < len(needle): a = needle[suffix + k] b = needle[max_suffix + k] condition = (a < b) if inverted else (b < a) if condition: # suffix is smaller, period is entire prefix so far. suffix += k k = 1 period = suffix - max_suffix elif a == b: # Advance through repetition of current period if k != period: k += 1 else: suffix += period k = 1 else: # found a bigger suffix. max_suffix = suffix suffix += 1 k = 1 period = 1 return max_suffix + 1, period def _critical_factorization(needle): """ >>> x = "GCAGAGAG" >>> suf, period = _critical_factorization(x) >>> x[:suf], x[suf:] ('GC', 'AGAGAG') >>> period 2 """ # Search using both forward and backward character-orderings max_suf1, pattern1 = _lex_search(needle, inverted=False) max_suf2, pattern2 = _lex_search(needle, inverted=True) # choose bigger suffix if max_suf2 < max_suf1: return max_suf1, pattern1 else: return max_suf2, pattern2 def _two_way(needle: str, haystack: str, suffix: int, period: int): # LOG(f"Split {needle!r} into {needle[:suffix]!r} and {needle[suffix:]!r}.") # LOG(f"{needle[suffix:]!r} is {period}-periodic.") # LOG(needle[:suffix], needle[suffix:], haystack) # If the entire needle is periodic... if needle[:suffix] == needle[period:period + suffix]: # LOG(f"The whole string {needle!r} is {period}-periodic.") # entirely periodic, so a mismatch can only advance by the period. # Use memory to avoid re-scanning known occurrences of the period. memory = 0 j = 0 # index into haystack while j + len(needle) <= len(haystack): # LOG(">", haystack) # LOG(">", " "*j + needle) i = max(suffix, memory) # scan right half while i < len(needle) and needle[i] == haystack[j + i]: i += 1 if len(needle) <= i: # LOG("Right half matches.") # scan left half i = suffix - 1 while memory < i + 1 and needle[i] == haystack[j + i]: i -= 1 if i + 1 < memory + 1: # LOG(f"Left half matches. Returning {j}.") return j # no match, so remember how many repetitions of period # on the right half were scanned j += period memory = len(needle) - period else: # LOG("Skip without checking left half.") j += i - suffix + 1 memory = 0 else: # not periodic # LOG(f"The whole string {needle!r} is not periodic.") # The two halves of needle are distinct; # no extra memory is required, # and any mismatch results in a maximal shift. period = 1 + max(suffix, len(needle) - suffix) needle_suffix = needle[suffix] # LOG(f"Using period {period}.") # LOG(f"Right half starts with {needle_suffix!r}.") j = 0 while j + len(needle) <= len(haystack): # LOG(f"Fast-search for {needle_suffix!r} in {haystack!r}[{suffix+j}:{suffix + len(haystack) - len(needle)+1}]") # use optimized code for single-character search find = haystack.find(needle_suffix, suffix + j, suffix + len(haystack) - len(needle) + 1) if find == -1: # LOG(f"Not found. Return -1.") return -1 j = find - suffix # LOG(f"Found at {j}.") # LOG(f"Matching " # f"{haystack!r}[{suffix+j}:] (={haystack[suffix+j:]!r}) " # f"to right half {needle[suffix:]!r}") # LOG(">", haystack) # LOG(">", " "*j + needle) # match the right half i = suffix + 1 while i < len(needle): if needle[i] != haystack[j + i]: # LOG("No match.") break i += 1 if len(needle) <= i: # match the left half # LOG("matching the left half.") i = suffix - 1 while i != -1: if needle[i] != haystack[i + j]: break i -= 1 if i == -1: # LOG(f"Match! (at {j})") return j j += period else: # LOG(f"Jump forward without checking left half.") j += i - suffix + 1 # LOG(f"Reached end. Returning -1.") return -1 def fastsearch(needle, haystack): """ >>> fastsearch("abc", "abacaba") -1 >>> fastsearch("cab", "abacaba") 3 """ # LOG("="*60) # LOG(f"{needle!r} in {haystack!r}") if needle == '': # LOG("Easy-out: empty") return 0 first = needle[0] index = haystack.find(first) # LOG(f"Looking for {first!r} in {haystack!r}") if index == -1: # LOG("Easy-out: didn't have first char") return -1 if len(haystack) - index < len(needle): # LOG("Easy-out: string won't fit.") return -1 if haystack[index:index + len(needle)] == needle: # If obvious match, skip the initialization overhead # LOG("Easy out: obvious match") return index suffix, period = _critical_factorization(needle) result = _two_way(needle, haystack[index:], suffix, period) if result == -1: return -1 else: return result + index def count(needle, haystack): """ >>> count("a", "abacaba") 4 >>> count("aa", "aabaaabaa") 3 """ if needle == '': return len(haystack) + 1 first = needle[0] index = haystack.find(first) if index == -1: return 0 if len(haystack) - index < len(needle): return 0 suffix, period = _critical_factorization(needle) count = 0 while True: result = _two_way(needle, haystack[index:], suffix, period) if result == -1: return count else: count += 1 index += result + len(needle) def test_fastsearch(): assert fastsearch("acbc", "acbc") == 0 assert fastsearch("cab", "abacaba") == 3 assert fastsearch("bca", "abacaba") == -1 assert fastsearch("aa", "bbabaaa") == 4 # copied from translate.py # For a variety of combinations, # verify that str.find() matches __contains__ # and that the found substring is really at that location charset = ['', 'a', 'b', 'c'] digits = 5 base = len(charset) teststrings = set() for i in range(base ** digits): entry = [] for j in range(digits): i, m = divmod(i, base) entry.append(charset[m]) teststrings.add(''.join(entry)) for i in teststrings: for j in teststrings: loc1 = fastsearch(j, i) loc2 = i.find(j) assert loc1 == loc2 R = 10 ** 5 data = 'A' * R + 'B' * R + 'A' * R + 'BBB' * R pattern = 'BBB' * R assert fastsearch(pattern, data) == 3*R def test_count(): assert count('bbbaab', 'bbbaab') == 1 # For a variety of combinations, # verify that str.count() matches count() charset = ['', 'a', 'b'] digits = 7 base = len(charset) teststrings = set() for i in range(base ** digits): entry = [] for j in range(digits): i, m = divmod(i, base) entry.append(charset[m]) teststrings.add(''.join(entry)) for i in teststrings: n = len(i) for j in teststrings: r1 = i.count(j) r2 = count(j, i) assert r1 == r2, (i, j, r1, r2) def time_this(N=10**7): from time import perf_counter haystack = "ABC" * N needle1 = "DDD" + "ABC" * (N // 10) needle2 = "DDD" + "ABC" * 10 t0 = perf_counter() fastsearch(needle1, haystack) t1 = perf_counter() print("Pure Python Two-Way, shorter needle:".ljust(40), t1 - t0) t0 = perf_counter() fastsearch(needle2, haystack) t1 = perf_counter() print("Pure Python Two-Way. Longer needle:".ljust(40), t1 - t0) t0 = perf_counter() haystack.find(needle1) t1 = perf_counter() print("Python builtin, shorter needle:".ljust(40), t1 - t0) t0 = perf_counter() haystack.find(needle2) t1 = perf_counter() print("Python builtin, longer needle:".ljust(40), t1 - t0) if __name__ == "__main__": # LOG = print test_fastsearch() test_count() time_this() # import doctest # doctest.testmod()