feat: handle regex operations on rust side
This commit is contained in:
@@ -80,81 +80,17 @@ export const concatStringsSep =
|
||||
return mkStringWithContext(result, context);
|
||||
};
|
||||
|
||||
/**
 * Bodies of the POSIX bracket-expression classes (`[:name:]`), keyed by class
 * name. Each value is written so it can be spliced directly between the
 * brackets of a JavaScript character class (`[...]`).
 *
 * `cntrl`, `graph` and `print` cover the ASCII ranges of those classes so that
 * valid POSIX patterns using them are no longer rejected as unknown.
 */
const POSIX_CLASSES: Record<string, string> = {
  alnum: "a-zA-Z0-9",
  alpha: "a-zA-Z",
  blank: " \\t",
  cntrl: "\\x00-\\x1f\\x7f",
  digit: "0-9",
  graph: "\\x21-\\x7e",
  lower: "a-z",
  print: "\\x20-\\x7e",
  punct: "\\-!\"#$%&'()*+,./:;<=>?@[\\\\\\]^_`{|}~",
  space: "\\s",
  upper: "A-Z",
  xdigit: "0-9A-Fa-f",
};
|
||||
|
||||
/**
 * Translate a pattern that may contain POSIX character classes (`[:alpha:]`,
 * `[:digit:]`, ...) into a JavaScript `RegExp` compiled with the `u` flag.
 *
 * @param pattern   Source pattern.
 * @param fullMatch When true, the resulting expression only matches if it
 *                  covers the entire input string.
 * @returns The compiled regular expression.
 * @throws Error if the pattern names a class missing from `POSIX_CLASSES`;
 *         `RegExp` construction errors propagate unchanged.
 */
function posixToJsRegex(pattern: string, fullMatch: boolean = false): RegExp {
  // Look up the character-class body for one POSIX class name.
  const classBody = (className: string): string => {
    const replacement = POSIX_CLASSES[className];
    if (!replacement) {
      throw new Error(`Unknown POSIX character class: ${className}`);
    }
    return replacement;
  };

  // Bracket expressions consisting solely of POSIX classes, optionally
  // negated: `[[:alpha:][:digit:]]`, `[^[:space:]]`.
  let jsPattern = pattern.replace(
    /\[(\^?)(?:\[:(\w+):\])+\]/g,
    (whole: string, negation: string) => {
      const combined = [...whole.matchAll(/\[:(\w+):\]/g)]
        .map((m) => classBody(m[1]))
        .join("");
      return negation === "^" ? `[^${combined}]` : `[${combined}]`;
    },
  );

  // Any `[:name:]` still left (e.g. mixed with literal characters inside a
  // bracket expression) is replaced by the bare class body.
  jsPattern = jsPattern.replace(/\[:(\w+):\]/g, (_match, className) =>
    classBody(className),
  );

  if (fullMatch) {
    // Anchor through a non-capturing group. Plain concatenation would turn
    // `a|b` into `^a|b$` (each alternative anchored on one side only) and
    // would mistake a trailing escaped `\$` for an existing anchor. The group
    // does not capture, so capture indices are unchanged.
    jsPattern = `^(?:${jsPattern})$`;
  }

  return new RegExp(jsPattern, "u");
}
|
||||
|
||||
export const match =
|
||||
(regex: NixValue) =>
|
||||
(str: NixValue): NixValue => {
|
||||
const regexStr = forceStringValue(regex);
|
||||
const inputStr = forceStringValue(str);
|
||||
|
||||
try {
|
||||
const re = posixToJsRegex(regexStr, true);
|
||||
const result = inputStr.match(re);
|
||||
|
||||
if (!result) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const groups: NixValue[] = [];
|
||||
for (let i = 1; i < result.length; i++) {
|
||||
groups.push(result[i] !== undefined ? result[i] : null);
|
||||
}
|
||||
|
||||
return groups;
|
||||
} catch (e) {
|
||||
throw new Error(`Invalid regular expression '${regexStr}': ${e}`);
|
||||
const result = Deno.core.ops.op_match(regexStr, inputStr);
|
||||
if (result === null) {
|
||||
return null;
|
||||
}
|
||||
return result.map((g) => (g !== null ? g : null));
|
||||
};
|
||||
|
||||
export const split =
|
||||
@@ -164,37 +100,16 @@ export const split =
|
||||
const inputStr = forceString(str);
|
||||
const inputStrValue = getStringValue(inputStr);
|
||||
|
||||
try {
|
||||
const re = posixToJsRegex(regexStr);
|
||||
const reGlobal = new RegExp(re.source, re.flags + "g");
|
||||
const result = Deno.core.ops.op_split(regexStr, inputStrValue);
|
||||
|
||||
const result: NixValue[] = [];
|
||||
let lastIndex = 0;
|
||||
let match: RegExpExecArray | null;
|
||||
|
||||
while ((match = reGlobal.exec(inputStrValue)) !== null) {
|
||||
result.push(inputStrValue.substring(lastIndex, match.index));
|
||||
|
||||
const groups: NixValue[] = [];
|
||||
for (let i = 1; i < match.length; i++) {
|
||||
groups.push(match[i] !== undefined ? match[i] : null);
|
||||
}
|
||||
result.push(groups);
|
||||
|
||||
lastIndex = match.index + match[0].length;
|
||||
|
||||
if (match[0].length === 0) {
|
||||
reGlobal.lastIndex++;
|
||||
}
|
||||
}
|
||||
|
||||
if (lastIndex === 0) {
|
||||
return [inputStr];
|
||||
}
|
||||
|
||||
result.push(inputStrValue.substring(lastIndex));
|
||||
return result;
|
||||
} catch (e) {
|
||||
throw new Error(`Invalid regular expression '${regexStr}': ${e}`);
|
||||
if (result.length === 1 && typeof result[0] === "string") {
|
||||
return [inputStr];
|
||||
}
|
||||
|
||||
return result.map((item) => {
|
||||
if (typeof item === "string") {
|
||||
return item;
|
||||
}
|
||||
return item.map((g) => (g !== null ? g : null));
|
||||
});
|
||||
};
|
||||
|
||||
2
nix-js/runtime-ts/src/types/global.d.ts
vendored
2
nix-js/runtime-ts/src/types/global.d.ts
vendored
@@ -93,6 +93,8 @@ declare global {
|
||||
sha256: string | null,
|
||||
include_paths: string[],
|
||||
): string;
|
||||
function op_match(regex: string, text: string): (string | null)[] | null;
|
||||
function op_split(regex: string, text: string): (string | (string | null)[])[];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,6 +63,8 @@ fn runtime_extension<Ctx: RuntimeContext>() -> Extension {
|
||||
op_get_env(),
|
||||
op_walk_dir(),
|
||||
op_add_filtered_path::<Ctx>(),
|
||||
op_match(),
|
||||
op_split(),
|
||||
];
|
||||
ops.extend(crate::fetcher::register_ops::<Ctx>());
|
||||
|
||||
@@ -136,6 +138,8 @@ impl<Ctx: RuntimeContext> Runtime<Ctx> {
|
||||
..Default::default()
|
||||
});
|
||||
|
||||
js_runtime.op_state().borrow_mut().put(RegexCache::new());
|
||||
|
||||
let (
|
||||
is_thunk_symbol,
|
||||
primop_metadata_symbol,
|
||||
|
||||
@@ -1,13 +1,39 @@
|
||||
use std::path::{Component, Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
|
||||
use hashbrown::hash_map::{HashMap, Entry};
|
||||
|
||||
use deno_core::OpState;
|
||||
use regex::Regex;
|
||||
use rust_embed::Embed;
|
||||
|
||||
use crate::error::Source;
|
||||
|
||||
use super::{NixRuntimeError, OpStateExt, RuntimeContext};
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub(super) struct RegexCache {
|
||||
cache: HashMap<String, Regex>,
|
||||
}
|
||||
|
||||
impl RegexCache {
|
||||
pub(super) fn new() -> Self {
|
||||
Self {
|
||||
cache: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_regex(&mut self, pattern: &str) -> Result<Regex, regex::Error> {
|
||||
Ok(match self.cache.entry(pattern.to_string()) {
|
||||
Entry::Occupied(occupied) => occupied.get().clone(),
|
||||
Entry::Vacant(vacant) => {
|
||||
let re = Regex::new(pattern)?;
|
||||
vacant.insert(re).clone()
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Embed)]
|
||||
#[folder = "src/runtime/corepkgs"]
|
||||
pub(crate) struct CorePkgs;
|
||||
@@ -639,3 +665,72 @@ pub(super) fn op_add_filtered_path<Ctx: RuntimeContext>(
|
||||
|
||||
Ok(store_path)
|
||||
}
|
||||
|
||||
#[deno_core::op2]
|
||||
#[serde]
|
||||
pub(super) fn op_match(
|
||||
state: &mut OpState,
|
||||
#[string] regex: String,
|
||||
#[string] text: String,
|
||||
) -> std::result::Result<Option<Vec<Option<String>>>, NixRuntimeError> {
|
||||
let cache = state.borrow_mut::<RegexCache>();
|
||||
let re = cache
|
||||
.get_regex(&format!("^{}$", regex))
|
||||
.map_err(|_| NixRuntimeError::from(format!("invalid regular expression '{}'", regex)))?;
|
||||
|
||||
match re.captures(&text) {
|
||||
Some(caps) => {
|
||||
let groups: Vec<Option<String>> = caps
|
||||
.iter()
|
||||
.skip(1)
|
||||
.map(|grp| grp.map(|g| g.as_str().to_string()))
|
||||
.collect();
|
||||
Ok(Some(groups))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[deno_core::op2]
|
||||
#[serde]
|
||||
pub(super) fn op_split(
|
||||
state: &mut OpState,
|
||||
#[string] regex: String,
|
||||
#[string] text: String,
|
||||
) -> std::result::Result<Vec<SplitResult>, NixRuntimeError> {
|
||||
let cache = state.borrow_mut::<RegexCache>();
|
||||
let re = cache
|
||||
.get_regex(®ex)
|
||||
.map_err(|_| NixRuntimeError::from(format!("invalid regular expression '{}'", regex)))?;
|
||||
|
||||
let mut capture_locations = re.capture_locations();
|
||||
let num_captures = capture_locations.len();
|
||||
let mut ret: Vec<SplitResult> = Vec::new();
|
||||
let mut pos = 0;
|
||||
|
||||
while let Some(thematch) = re.captures_read_at(&mut capture_locations, &text, pos) {
|
||||
ret.push(SplitResult::Text(text[pos..thematch.start()].to_string()));
|
||||
|
||||
let captures: Vec<Option<String>> = (1..num_captures)
|
||||
.map(|i| capture_locations.get(i))
|
||||
.map(|o| o.map(|(start, end)| text[start..end].to_string()))
|
||||
.collect();
|
||||
ret.push(SplitResult::Captures(captures));
|
||||
|
||||
if pos == text.len() {
|
||||
break;
|
||||
}
|
||||
pos = thematch.end();
|
||||
}
|
||||
|
||||
ret.push(SplitResult::Text(text[pos..].to_string()));
|
||||
|
||||
Ok(ret)
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
#[serde(untagged)]
|
||||
pub(super) enum SplitResult {
|
||||
Text(String),
|
||||
Captures(Vec<Option<String>>),
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user