feat: handle regex operations on rust side

This commit is contained in:
2026-02-08 01:11:06 +08:00
parent 26e7b74585
commit f5364ded1e
4 changed files with 115 additions and 99 deletions

View File

@@ -80,81 +80,17 @@ export const concatStringsSep =
return mkStringWithContext(result, context);
};
/**
 * JavaScript character-class bodies for the supported POSIX bracket classes.
 * Each value is meant to be placed between `[` and `]` of a JS regex.
 */
const POSIX_CLASSES: Record<string, string> = {
  alnum: "a-zA-Z0-9",
  alpha: "a-zA-Z",
  blank: " \\t",
  digit: "0-9",
  lower: "a-z",
  upper: "A-Z",
  space: "\\s",
  xdigit: "0-9A-Fa-f",
  punct: "\\-!\"#$%&'()*+,./:;<=>?@[\\\\\\]^_`{|}~",
};

/**
 * Returns the character-class body for a POSIX class name.
 *
 * @throws Error if `className` is not a key of `POSIX_CLASSES`.
 */
function posixClassBody(className: string): string {
  // Own-property check: a plain lookup would accept inherited names such as
  // "constructor" or "toString" and splice a non-string into the pattern.
  if (!Object.prototype.hasOwnProperty.call(POSIX_CLASSES, className)) {
    throw new Error(`Unknown POSIX character class: ${className}`);
  }
  return POSIX_CLASSES[className];
}

/**
 * Translates a POSIX-style regular expression into a JavaScript `RegExp`.
 *
 * `[:name:]` classes are expanded using `POSIX_CLASSES`; everything else is
 * passed through unchanged.
 *
 * @param pattern   The POSIX-style pattern source.
 * @param fullMatch When true, the resulting regex only matches the whole input.
 * @returns A `RegExp` compiled with the `u` flag.
 * @throws Error on an unknown POSIX class name; `SyntaxError` from `RegExp`
 *         if the translated pattern is not a valid JS regex.
 */
function posixToJsRegex(pattern: string, fullMatch: boolean = false): RegExp {
  // Bracket expressions made up solely of POSIX classes, e.g.
  // `[[:alpha:][:digit:]]` or `[^[:space:]]`.
  let jsPattern = pattern.replace(
    /\[(\^?)(?:\[:(\w+):\])+\]/g,
    (bracket: string, negation: string) => {
      const combined = [...bracket.matchAll(/\[:(\w+):\]/g)]
        .map((m) => posixClassBody(m[1]))
        .join("");
      return negation === "^" ? `[^${combined}]` : `[${combined}]`;
    },
  );

  // Remaining classes, e.g. the `[:digit:]` inside `[a[:digit:]]`.
  jsPattern = jsPattern.replace(
    /\[:(\w+):\]/g,
    (_match: string, className: string) => posixClassBody(className),
  );

  if (fullMatch) {
    // Group before anchoring: without the group, `a|b` would become `^a|b$`
    // (anchors bind tighter than `|`), and a pattern ending in an escaped
    // `\$` would be mistaken for one that is already anchored.
    jsPattern = `^(?:${jsPattern})$`;
  }
  return new RegExp(jsPattern, "u");
}
export const match =
(regex: NixValue) =>
(str: NixValue): NixValue => {
const regexStr = forceStringValue(regex);
const inputStr = forceStringValue(str);
try {
const re = posixToJsRegex(regexStr, true);
const result = inputStr.match(re);
if (!result) {
return null;
}
const groups: NixValue[] = [];
for (let i = 1; i < result.length; i++) {
groups.push(result[i] !== undefined ? result[i] : null);
}
return groups;
} catch (e) {
throw new Error(`Invalid regular expression '${regexStr}': ${e}`);
const result = Deno.core.ops.op_match(regexStr, inputStr);
if (result === null) {
return null;
}
return result.map((g) => (g !== null ? g : null));
};
export const split =
@@ -164,37 +100,16 @@ export const split =
const inputStr = forceString(str);
const inputStrValue = getStringValue(inputStr);
try {
const re = posixToJsRegex(regexStr);
const reGlobal = new RegExp(re.source, re.flags + "g");
const result = Deno.core.ops.op_split(regexStr, inputStrValue);
const result: NixValue[] = [];
let lastIndex = 0;
let match: RegExpExecArray | null;
while ((match = reGlobal.exec(inputStrValue)) !== null) {
result.push(inputStrValue.substring(lastIndex, match.index));
const groups: NixValue[] = [];
for (let i = 1; i < match.length; i++) {
groups.push(match[i] !== undefined ? match[i] : null);
}
result.push(groups);
lastIndex = match.index + match[0].length;
if (match[0].length === 0) {
reGlobal.lastIndex++;
}
}
if (lastIndex === 0) {
return [inputStr];
}
result.push(inputStrValue.substring(lastIndex));
return result;
} catch (e) {
throw new Error(`Invalid regular expression '${regexStr}': ${e}`);
if (result.length === 1 && typeof result[0] === "string") {
return [inputStr];
}
return result.map((item) => {
if (typeof item === "string") {
return item;
}
return item.map((g) => (g !== null ? g : null));
});
};

View File

@@ -93,6 +93,8 @@ declare global {
sha256: string | null,
include_paths: string[],
): string;
function op_match(regex: string, text: string): (string | null)[] | null;
function op_split(regex: string, text: string): (string | (string | null)[])[];
}
}
}

View File

@@ -63,6 +63,8 @@ fn runtime_extension<Ctx: RuntimeContext>() -> Extension {
op_get_env(),
op_walk_dir(),
op_add_filtered_path::<Ctx>(),
op_match(),
op_split(),
];
ops.extend(crate::fetcher::register_ops::<Ctx>());
@@ -136,6 +138,8 @@ impl<Ctx: RuntimeContext> Runtime<Ctx> {
..Default::default()
});
js_runtime.op_state().borrow_mut().put(RegexCache::new());
let (
is_thunk_symbol,
primop_metadata_symbol,

View File

@@ -1,13 +1,39 @@
use std::path::{Component, Path, PathBuf};
use std::sync::Arc;
use hashbrown::hash_map::{HashMap, Entry};
use deno_core::OpState;
use regex::Regex;
use rust_embed::Embed;
use crate::error::Source;
use super::{NixRuntimeError, OpStateExt, RuntimeContext};
/// Cache of compiled regular expressions, keyed by their pattern source.
///
/// Nothing in this type removes entries, so every distinct pattern stays
/// cached for as long as the cache itself lives.
#[derive(Debug, Default)]
pub(super) struct RegexCache {
// Maps a pattern string to the `Regex` compiled from it.
cache: HashMap<String, Regex>,
}
impl RegexCache {
    /// Creates a cache with no compiled patterns in it.
    pub(super) fn new() -> Self {
        Self::default()
    }

    /// Returns the compiled regex for `pattern`, compiling and caching it on
    /// first use.
    ///
    /// # Errors
    ///
    /// Returns the `regex::Error` from `Regex::new` when `pattern` does not
    /// compile; nothing is cached in that case.
    fn get_regex(&mut self, pattern: &str) -> Result<Regex, regex::Error> {
        let compiled = match self.cache.entry(pattern.to_string()) {
            Entry::Occupied(slot) => slot.get().clone(),
            // `?` leaves the vacant slot untouched, so failures are not cached.
            Entry::Vacant(slot) => slot.insert(Regex::new(pattern)?).clone(),
        };
        Ok(compiled)
    }
}
#[derive(Embed)]
#[folder = "src/runtime/corepkgs"]
pub(crate) struct CorePkgs;
@@ -639,3 +665,72 @@ pub(super) fn op_add_filtered_path<Ctx: RuntimeContext>(
Ok(store_path)
}
/// Matches `text` against `regex`, requiring the whole text to match.
///
/// Returns `None` when the text does not match. Otherwise returns one entry
/// per capture group of `regex` (the overall match is not included); a group
/// that did not take part in the match is `None`.
///
/// # Errors
///
/// Returns a `NixRuntimeError` with the message
/// `invalid regular expression '<regex>'` when the pattern does not compile.
#[deno_core::op2]
#[serde]
pub(super) fn op_match(
    state: &mut OpState,
    #[string] regex: String,
    #[string] text: String,
) -> std::result::Result<Option<Vec<Option<String>>>, NixRuntimeError> {
    // Wrap the pattern in a non-capturing group before anchoring. Anchors
    // bind tighter than `|`, so a bare `^a|b$` would accept any text that
    // merely starts with `a` or ends with `b`. The group is non-capturing so
    // the caller's capture-group numbering is unchanged.
    // NOTE(review): a pattern with unbalanced parentheses such as `a)(b` may
    // become valid once wrapped — confirm whether that needs rejecting.
    let anchored = format!("^(?:{})$", regex);
    let re = state
        .borrow_mut::<RegexCache>()
        .get_regex(&anchored)
        .map_err(|_| NixRuntimeError::from(format!("invalid regular expression '{}'", regex)))?;

    Ok(re.captures(&text).map(|caps| {
        caps.iter()
            // Index 0 is the overall match; only the groups are returned.
            .skip(1)
            .map(|grp| grp.map(|g| g.as_str().to_string()))
            .collect()
    }))
}
/// Splits `text` at every match of `regex`.
///
/// The result alternates between `SplitResult::Text` (the input between two
/// matches, possibly empty) and `SplitResult::Captures` (the capture groups
/// of one match, `None` for groups that did not participate). It always
/// starts and ends with a `Text` element, so an input without any match
/// yields a single `Text` holding the whole input.
///
/// # Errors
///
/// Returns a `NixRuntimeError` with the message
/// `invalid regular expression '<regex>'` when the pattern does not compile.
#[deno_core::op2]
#[serde]
pub(super) fn op_split(
    state: &mut OpState,
    #[string] regex: String,
    #[string] text: String,
) -> std::result::Result<Vec<SplitResult>, NixRuntimeError> {
    let cache = state.borrow_mut::<RegexCache>();
    let re = cache
        .get_regex(&regex)
        .map_err(|_| NixRuntimeError::from(format!("invalid regular expression '{}'", regex)))?;

    let mut capture_locations = re.capture_locations();
    let num_captures = capture_locations.len();
    let mut ret: Vec<SplitResult> = Vec::new();

    // `pos` is the end of the previous match, i.e. where the next text
    // segment begins. `search_from` is where the next search starts; it runs
    // ahead of `pos` only after a zero-length match.
    let mut pos = 0;
    let mut search_from = 0;
    while let Some(found) = re.captures_read_at(&mut capture_locations, &text, search_from) {
        ret.push(SplitResult::Text(text[pos..found.start()].to_string()));
        let captures: Vec<Option<String>> = (1..num_captures)
            .map(|i| {
                capture_locations
                    .get(i)
                    .map(|(start, end)| text[start..end].to_string())
            })
            .collect();
        ret.push(SplitResult::Captures(captures));

        pos = found.end();
        search_from = if found.start() == found.end() {
            // A zero-length match consumes nothing. Searching again from the
            // same offset would find it again forever, so step over one whole
            // character (keeping the offset on a UTF-8 boundary). At the end
            // of the input there is nothing left to step over.
            match text[pos..].chars().next() {
                Some(ch) => pos + ch.len_utf8(),
                None => break,
            }
        } else {
            pos
        };
    }
    ret.push(SplitResult::Text(text[pos..].to_string()));
    Ok(ret)
}
/// One element of the list returned by `op_split`.
///
/// Because of `#[serde(untagged)]`, each variant serializes as its bare
/// payload with no wrapper naming the variant.
#[derive(serde::Serialize)]
#[serde(untagged)]
pub(super) enum SplitResult {
/// A stretch of the input outside any match (may be empty).
Text(String),
/// The capture groups of one match; `None` marks a group that did not
/// participate in the match.
Captures(Vec<Option<String>>),
}