From f5364ded1eed59b1ca3da2d552925234ac6bf91a Mon Sep 17 00:00:00 2001 From: imxyy_soope_ Date: Sun, 8 Feb 2026 01:11:06 +0800 Subject: [PATCH] feat: handle regex operations on rust side --- nix-js/runtime-ts/src/builtins/string.ts | 113 +++-------------------- nix-js/runtime-ts/src/types/global.d.ts | 2 + nix-js/src/runtime.rs | 4 + nix-js/src/runtime/ops.rs | 95 +++++++++++++++++++ 4 files changed, 115 insertions(+), 99 deletions(-) diff --git a/nix-js/runtime-ts/src/builtins/string.ts b/nix-js/runtime-ts/src/builtins/string.ts index af32e51..9556ddb 100644 --- a/nix-js/runtime-ts/src/builtins/string.ts +++ b/nix-js/runtime-ts/src/builtins/string.ts @@ -80,81 +80,17 @@ export const concatStringsSep = return mkStringWithContext(result, context); }; -const POSIX_CLASSES: Record = { - alnum: "a-zA-Z0-9", - alpha: "a-zA-Z", - blank: " \\t", - digit: "0-9", - lower: "a-z", - upper: "A-Z", - space: "\\s", - xdigit: "0-9A-Fa-f", - punct: "\\-!\"#$%&'()*+,./:;<=>?@[\\\\\\]^_`{|}~", -}; - -function posixToJsRegex(pattern: string, fullMatch: boolean = false): RegExp { - let jsPattern = pattern; - - jsPattern = jsPattern.replace(/\[(\^?)(?:\[:(\w+):\])+\]/g, (match) => { - const isNegated = match[1] === "^"; - const classNames = [...match.matchAll(/\[:(\w+):\]/g)].map((m) => m[1]); - - const combined = classNames - .map((className) => { - const replacement = POSIX_CLASSES[className]; - if (!replacement) { - throw new Error(`Unknown POSIX character class: ${className}`); - } - return replacement; - }) - .join(""); - - return isNegated ? `[^${combined}]` : `[${combined}]`; - }); - - jsPattern = jsPattern.replace(/\[:(\w+):\]/g, (_match, className) => { - const replacement = POSIX_CLASSES[className]; - if (!replacement) { - throw new Error(`Unknown POSIX character class: ${className}`); - } - return replacement; - }); - - if (fullMatch) { - if (!jsPattern.startsWith("^")) { - jsPattern = "^" + jsPattern; - } - if (!jsPattern.endsWith("$")) { - jsPattern = jsPattern + "$"; - } - } - - return new RegExp(jsPattern, "u"); -} - export const match = (regex: NixValue) => (str: NixValue): NixValue => { const regexStr = forceStringValue(regex); const inputStr = forceStringValue(str); - try { - const re = posixToJsRegex(regexStr, true); - const result = inputStr.match(re); - - if (!result) { - return null; - } - - const groups: NixValue[] = []; - for (let i = 1; i < result.length; i++) { - groups.push(result[i] !== undefined ? result[i] : null); - } - - return groups; - } catch (e) { - throw new Error(`Invalid regular expression '${regexStr}': ${e}`); + const result = Deno.core.ops.op_match(regexStr, inputStr); + if (result === null) { + return null; } + return result.map((g) => (g !== null ? g : null)); }; export const split = @@ -164,37 +100,16 @@ export const split = const inputStr = forceString(str); const inputStrValue = getStringValue(inputStr); - try { - const re = posixToJsRegex(regexStr); - const reGlobal = new RegExp(re.source, re.flags + "g"); + const result = Deno.core.ops.op_split(regexStr, inputStrValue); - const result: NixValue[] = []; - let lastIndex = 0; - let match: RegExpExecArray | null; - - while ((match = reGlobal.exec(inputStrValue)) !== null) { - result.push(inputStrValue.substring(lastIndex, match.index)); - - const groups: NixValue[] = []; - for (let i = 1; i < match.length; i++) { - groups.push(match[i] !== undefined ? match[i] : null); - } - result.push(groups); - - lastIndex = match.index + match[0].length; - - if (match[0].length === 0) { - reGlobal.lastIndex++; - } - } - - if (lastIndex === 0) { - return [inputStr]; - } - - result.push(inputStrValue.substring(lastIndex)); - return result; - } catch (e) { - throw new Error(`Invalid regular expression '${regexStr}': ${e}`); + if (result.length === 1 && typeof result[0] === "string") { + return [inputStr]; } + + return result.map((item) => { + if (typeof item === "string") { + return item; + } + return item.map((g) => (g !== null ? g : null)); + }); }; diff --git a/nix-js/runtime-ts/src/types/global.d.ts b/nix-js/runtime-ts/src/types/global.d.ts index f38d87d..0667c43 100644 --- a/nix-js/runtime-ts/src/types/global.d.ts +++ b/nix-js/runtime-ts/src/types/global.d.ts @@ -93,6 +93,8 @@ declare global { sha256: string | null, include_paths: string[], ): string; + function op_match(regex: string, text: string): (string | null)[] | null; + function op_split(regex: string, text: string): (string | (string | null)[])[]; } } } diff --git a/nix-js/src/runtime.rs b/nix-js/src/runtime.rs index 3ea78d5..d915aca 100644 --- a/nix-js/src/runtime.rs +++ b/nix-js/src/runtime.rs @@ -63,6 +63,8 @@ fn runtime_extension() -> Extension { op_get_env(), op_walk_dir(), op_add_filtered_path::(), + op_match(), + op_split(), ]; ops.extend(crate::fetcher::register_ops::()); @@ -136,6 +138,8 @@ impl Runtime { ..Default::default() }); + js_runtime.op_state().borrow_mut().put(RegexCache::new()); + let ( is_thunk_symbol, primop_metadata_symbol, diff --git a/nix-js/src/runtime/ops.rs b/nix-js/src/runtime/ops.rs index 3913191..7af84c1 100644 --- a/nix-js/src/runtime/ops.rs +++ b/nix-js/src/runtime/ops.rs @@ -1,13 +1,39 @@ use std::path::{Component, Path, PathBuf}; use std::sync::Arc; +use hashbrown::hash_map::{HashMap, Entry}; + use deno_core::OpState; +use regex::Regex; use rust_embed::Embed; use crate::error::Source; use super::{NixRuntimeError, OpStateExt, RuntimeContext}; +#[derive(Debug, Default)] +pub(super) struct RegexCache { + cache: HashMap, +} + +impl RegexCache { + pub(super) fn new() -> Self { + Self { + cache: HashMap::new(), + } + } + + fn get_regex(&mut self, pattern: &str) -> Result { + Ok(match self.cache.entry(pattern.to_string()) { + Entry::Occupied(occupied) => occupied.get().clone(), + Entry::Vacant(vacant) => { + let re = Regex::new(pattern)?; + vacant.insert(re).clone() + } + }) + } +} + #[derive(Embed)] #[folder = "src/runtime/corepkgs"] pub(crate) struct CorePkgs; @@ -639,3 +665,72 @@ pub(super) fn op_add_filtered_path( Ok(store_path) } + +#[deno_core::op2] +#[serde] +pub(super) fn op_match( + state: &mut OpState, + #[string] regex: String, + #[string] text: String, +) -> std::result::Result>>, NixRuntimeError> { + let cache = state.borrow_mut::(); + let re = cache + .get_regex(&format!("^{}$", regex)) + .map_err(|_| NixRuntimeError::from(format!("invalid regular expression '{}'", regex)))?; + + match re.captures(&text) { + Some(caps) => { + let groups: Vec> = caps + .iter() + .skip(1) + .map(|grp| grp.map(|g| g.as_str().to_string())) + .collect(); + Ok(Some(groups)) + } + None => Ok(None), + } +} + +#[deno_core::op2] +#[serde] +pub(super) fn op_split( + state: &mut OpState, + #[string] regex: String, + #[string] text: String, +) -> std::result::Result, NixRuntimeError> { + let cache = state.borrow_mut::(); + let re = cache + .get_regex(®ex) + .map_err(|_| NixRuntimeError::from(format!("invalid regular expression '{}'", regex)))?; + + let mut capture_locations = re.capture_locations(); + let num_captures = capture_locations.len(); + let mut ret: Vec = Vec::new(); + let mut pos = 0; + + while let Some(thematch) = re.captures_read_at(&mut capture_locations, &text, pos) { + ret.push(SplitResult::Text(text[pos..thematch.start()].to_string())); + + let captures: Vec> = (1..num_captures) + .map(|i| capture_locations.get(i)) + .map(|o| o.map(|(start, end)| text[start..end].to_string())) + .collect(); + ret.push(SplitResult::Captures(captures)); + + if pos == text.len() { + break; + } + pos = thematch.end(); + } + + ret.push(SplitResult::Text(text[pos..].to_string())); + + Ok(ret) +} + +#[derive(serde::Serialize)] +#[serde(untagged)] +pub(super) enum SplitResult { + Text(String), + Captures(Vec>), +}