Wider than expected performance gap between NSRegularExpression and StringProcessing
fwgreen opened this issue · comments
I know that the optimization work is yet to come, but I'm wondering if this could be caused by something else. I have two versions of RegexRedux and, on my M1, processing the full 50 MB text file takes roughly 9 seconds with NSRegularExpression and over 3 minutes with StringProcessing:
import Foundation
extension String {
    /// Counts the matches of a regular expression across the whole string.
    ///
    /// - Parameter pattern: An `NSRegularExpression` pattern. An invalid
    ///   pattern is treated as a programmer error and traps (`try!`).
    /// - Returns: The number of matches `NSRegularExpression` reports for the
    ///   full extent of the string.
    func countMatches(of pattern: String) -> Int {
        let regex = try! NSRegularExpression(pattern: pattern)
        // `NSRange` is measured in UTF-16 code units, while `count` is the
        // number of `Character`s. Using `count` as the length truncates the
        // search range whenever the text contains characters that span more
        // than one UTF-16 unit, so derive the range from the string's indices.
        let range = NSRange(startIndex..<endIndex, in: self)
        return regex.numberOfMatches(in: self, range: range)
    }
}
// Read all of standard input as UTF-8 text; traps if the bytes are not valid UTF-8.
let input = String(data: FileHandle.standardInput.readDataToEndOfFile(), encoding: .utf8)!

// Remove everything from each ">" through the end of its line, plus every remaining newline.
let sequence = input.replacingOccurrences(
    of: #">[^\n]*\n|\n"#,
    with: "",
    options: .regularExpression
)

// Apply the substitutions one after another in a detached task and report
// the character count of the final text.
let resultLength = Task.detached { () -> Int in
    let substitutions = [
        (regex: "tHa[Nt]", replacement: "<4>"),
        (regex: "aND|caN|Ha[DS]|WaS", replacement: "<3>"),
        (regex: "a[NSt]|BY", replacement: "<2>"),
        (regex: "<[^>]*>", replacement: "|"),
        (regex: "\\|[^|][^|]*\\|", replacement: "-")
    ]
    var rewritten = sequence
    for step in substitutions {
        rewritten = rewritten.replacingOccurrences(
            of: step.regex,
            with: step.replacement,
            options: .regularExpression
        )
    }
    return rewritten.count
}

// Patterns whose matches are counted in the cleaned sequence.
let variants = [
    "agggtaaa|tttaccct",
    "[cgt]gggtaaa|tttaccc[acg]",
    "a[act]ggtaaa|tttacc[agt]t",
    "ag[act]gtaaa|tttac[agt]ct",
    "agg[act]taaa|ttta[agt]cct",
    "aggg[acg]aaa|ttt[cgt]ccct",
    "agggt[cgt]aa|tt[acg]accct",
    "agggta[cgt]a|t[acg]taccct",
    "agggtaa[cgt]|[acg]ttaccct"
]

// Count each pattern in its own child task, then print the results in the
// order the patterns are listed rather than the order the tasks finish.
await withTaskGroup(of: (variant: String, count: Int).self) { group in
    for pattern in variants {
        group.addTask {
            let hits = sequence.countMatches(of: pattern)
            return (variant: pattern, count: hits)
        }
    }

    var tally: [String: Int] = [:]
    for await finished in group {
        tally[finished.variant] = finished.count
    }

    for pattern in variants {
        print(pattern, tally[pattern] ?? 0)
    }
}

// Blank line, then the lengths of the raw input, the cleaned sequence and the substituted text.
let rawLength = input.count
let cleanedLength = sequence.count
let substitutedLength = await resultLength.value
print("", rawLength, cleanedLength, substitutedLength, separator: "\n")
Except for the `countMatches` extension on `String`, I've tried to keep both programs roughly the same.
import Foundation
// Read all of standard input as UTF-8 text; traps if the bytes are not valid UTF-8.
let input = String(data: FileHandle.standardInput.readDataToEndOfFile(), encoding: .utf8)!

// Remove everything from each ">" through the end of its line, plus every remaining newline.
let headerOrNewline = try! Regex(">[^\n]*\n|\n")
let sequence = input.replacing(headerOrNewline, with: "")

// Apply the substitutions one after another in a detached task and report
// the character count of the final text.
let resultLength = Task.detached { () -> Int in
    let substitutions = [
        (regex: "tHa[Nt]", replacement: "<4>"),
        (regex: "aND|caN|Ha[DS]|WaS", replacement: "<3>"),
        (regex: "a[NSt]|BY", replacement: "<2>"),
        (regex: "<[^>]*>", replacement: "|"),
        (regex: "\\|[^|][^|]*\\|", replacement: "-")
    ]
    var rewritten = sequence
    for step in substitutions {
        // The regex is built from its pattern string at the point of use, once per substitution.
        rewritten = rewritten.replacing(try! Regex(step.regex), with: step.replacement)
    }
    return rewritten.count
}

// Patterns whose matches are counted in the cleaned sequence.
let variants = [
    "agggtaaa|tttaccct",
    "[cgt]gggtaaa|tttaccc[acg]",
    "a[act]ggtaaa|tttacc[agt]t",
    "ag[act]gtaaa|tttac[agt]ct",
    "agg[act]taaa|ttta[agt]cct",
    "aggg[acg]aaa|ttt[cgt]ccct",
    "agggt[cgt]aa|tt[acg]accct",
    "agggta[cgt]a|t[acg]taccct",
    "agggtaa[cgt]|[acg]ttaccct"
]

// Count each pattern in its own child task, then print the results in the
// order the patterns are listed rather than the order the tasks finish.
await withTaskGroup(of: (variant: String, count: Int).self) { group in
    for pattern in variants {
        group.addTask {
            // Each child task constructs its own regex from the pattern string.
            let hits = sequence.matches(of: try! Regex(pattern)).count
            return (variant: pattern, count: hits)
        }
    }

    var tally: [String: Int] = [:]
    for await finished in group {
        tally[finished.variant] = finished.count
    }

    for pattern in variants {
        print(pattern, tally[pattern] ?? 0)
    }
}

// Blank line, then the lengths of the raw input, the cleaned sequence and the substituted text.
let rawLength = input.count
let cleanedLength = sequence.count
let substitutedLength = await resultLength.value
print("", rawLength, cleanedLength, substitutedLength, separator: "\n")
Hopefully I'm using the right compiler flags:
swiftc RegexRedux.swift -Ounchecked -o RegexRedux
./RegexRedux 0 < input.txt
Thanks for all your hard work on this project.