swiftlang / swift-experimental-string-processing

An early experimental general-purpose pattern matching engine for Swift.

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Wider than expected performance gap between NSRegularExpression and StringProcessing

fwgreen opened this issue · comments

I know that the optimization work is yet to come, but I'm wondering whether a performance gap this large could be caused by something else. I have two versions of RegexRedux and, on my M1, processing the full 50 MB text file takes roughly 9 seconds with NSRegularExpression and over 3 minutes with StringProcessing. Here is the NSRegularExpression version:

import Foundation

extension String {
  /// Counts the matches of a regular expression across the whole string.
  ///
  /// - Parameter pattern: An `NSRegularExpression` pattern. An invalid pattern
  ///   traps, because the expression is compiled with `try!`.
  /// - Returns: The number of matches `NSRegularExpression` reports for the
  ///   full extent of the string.
  func countMatches(of pattern: String) -> Int {
    let regex = try! NSRegularExpression(pattern: pattern)
    // NSRange is measured in UTF-16 code units, while `count` is the number of
    // Characters. The two disagree as soon as the text contains anything that
    // is not a single UTF-16 unit per Character (emoji, combining sequences),
    // which would truncate the searched range. Deriving the range from the
    // string's own indices is always correct and avoids walking the string
    // just to count Characters.
    let range = NSRange(startIndex..<endIndex, in: self)
    return regex.numberOfMatches(in: self, range: range)
  }
}

// Read all of standard input and decode it as UTF-8; the force-unwrap traps
// if the bytes are not valid UTF-8.
let stdinData = FileHandle.standardInput.readDataToEndOfFile()
let input = String(data: stdinData, encoding: .utf8)!

// Delete every run from a `>` through the end of its line, plus every
// remaining newline.
let sequence = input.replacingOccurrences(
  of: #">[^\n]*\n|\n"#,
  with: "",
  options: .regularExpression
)

// In a detached task, apply the five substitutions to `sequence` one after
// another (each works on the previous result) and produce the Character count
// of the final string.
let resultLength: Task<Int, Never> = Task.detached {
  let substitutions = [
    (regex: "tHa[Nt]",            replacement: "<4>"),
    (regex: "aND|caN|Ha[DS]|WaS", replacement: "<3>"),
    (regex: "a[NSt]|BY",          replacement: "<2>"),
    (regex: "<[^>]*>",            replacement: "|"),
    (regex: "\\|[^|][^|]*\\|",    replacement: "-"),
  ]

  var buffer = sequence
  for iub in substitutions {
    buffer = buffer.replacingOccurrences(
      of: iub.regex,
      with: iub.replacement,
      options: .regularExpression
    )
  }
  return buffer.count
}

// The nine patterns whose matches are counted in `sequence`; each is an
// alternation of two alternatives. Results are printed in this order.
let variants: [String] = [
  "agggtaaa|tttaccct",
  "[cgt]gggtaaa|tttaccc[acg]",
  "a[act]ggtaaa|tttacc[agt]t",
  "ag[act]gtaaa|tttac[agt]ct",
  "agg[act]taaa|ttta[agt]cct",
  "aggg[acg]aaa|ttt[cgt]ccct",
  "agggt[cgt]aa|tt[acg]accct",
  "agggta[cgt]a|t[acg]taccct",
  "agggtaa[cgt]|[acg]ttaccct",
]

// Count each pattern's matches in its own child task, collect the results
// keyed by pattern, then print them in the order `variants` lists them.
await withTaskGroup(of: (variant: String, count: Int).self) { group in
  for pattern in variants {
    group.addTask {
      (variant: pattern, count: sequence.countMatches(of: pattern))
    }
  }

  // Child tasks finish in arbitrary order, hence the dictionary.
  var tally: [String: Int] = [:]
  for await result in group {
    tally[result.variant] = result.count
  }

  for pattern in variants {
    print(pattern, tally[pattern, default: 0])
  }
}

// Emit a blank line followed by three Character counts, one per line: the raw
// input, the stripped sequence, and the result of the substitution task.
let substitutedLength = await resultLength.value
print("", input.count, sequence.count, substitutedLength, separator: "\n")

Except for the countMatches extension on String, which only the NSRegularExpression version above needs, I've tried to keep both programs roughly the same. Here is the StringProcessing version:

import Foundation

// Read all of standard input and decode it as UTF-8; the force-unwrap traps
// if the bytes are not valid UTF-8.
let stdinData = FileHandle.standardInput.readDataToEndOfFile()
let input = String(data: stdinData, encoding: .utf8)!

// Matches a run from `>` through the end of its line, or a lone newline.
// The literal is not a raw string, so each `\n` reaches the regex parser as an
// actual newline character rather than as a backslash escape.
let headerOrNewline = try! Regex(">[^\n]*\n|\n")
let sequence = input.replacing(headerOrNewline, with: "")

// In a detached task, apply the five substitutions to `sequence` one after
// another (each works on the previous result) and produce the Character count
// of the final string. Each pattern is compiled inside the task with `try!`,
// so an invalid pattern traps.
let resultLength: Task<Int, Never> = Task.detached {
  let substitutions = [
    (regex: "tHa[Nt]",            replacement: "<4>"),
    (regex: "aND|caN|Ha[DS]|WaS", replacement: "<3>"),
    (regex: "a[NSt]|BY",          replacement: "<2>"),
    (regex: "<[^>]*>",            replacement: "|"),
    (regex: "\\|[^|][^|]*\\|",    replacement: "-"),
  ]

  var buffer = sequence
  for iub in substitutions {
    let compiled = try! Regex(iub.regex)
    buffer = buffer.replacing(compiled, with: iub.replacement)
  }
  return buffer.count
}

// The nine patterns whose matches are counted in `sequence`; each is an
// alternation of two alternatives. Results are printed in this order.
let variants: [String] = [
  "agggtaaa|tttaccct",
  "[cgt]gggtaaa|tttaccc[acg]",
  "a[act]ggtaaa|tttacc[agt]t",
  "ag[act]gtaaa|tttac[agt]ct",
  "agg[act]taaa|ttta[agt]cct",
  "aggg[acg]aaa|ttt[cgt]ccct",
  "agggt[cgt]aa|tt[acg]accct",
  "agggta[cgt]a|t[acg]taccct",
  "agggtaa[cgt]|[acg]ttaccct",
]

// Count each pattern's matches in its own child task, collect the results
// keyed by pattern, then print them in the order `variants` lists them.
await withTaskGroup(of: (variant: String, count: Int).self) { group in
  for pattern in variants {
    group.addTask {
      // The Regex is built inside the child task from the pattern string.
      let matches = sequence.matches(of: try! Regex(pattern))
      return (variant: pattern, count: matches.count)
    }
  }

  // Child tasks finish in arbitrary order, hence the dictionary.
  var tally: [String: Int] = [:]
  for await result in group {
    tally[result.variant] = result.count
  }

  for pattern in variants {
    print(pattern, tally[pattern, default: 0])
  }
}

// Emit a blank line followed by three Character counts, one per line: the raw
// input, the stripped sequence, and the result of the substitution task.
let substitutedLength = await resultLength.value
print("", input.count, sequence.count, substitutedLength, separator: "\n")

Hopefully I'm using the right compiler flags:

# Build with optimizations; -Ounchecked additionally removes runtime safety checks.
swiftc RegexRedux.swift -Ounchecked -o RegexRedux
# Feed the data file on standard input. Neither program shown above reads its
# command-line arguments, so the `0` is not used by them.
./RegexRedux 0 < input.txt

Thanks for all your hard work on this project.