suppressPackageStartupMessages({
library(glue)
library(duckdb)
library(stringi)
library(tidyverse)
library(tidymodels)
library(tidyclust)
library(hrbrthemes)
})
This is a basic example of how to work with DuckDB’s new vector search superpowers.
KEV Similarity Search With DuckDB
We’re using for some of the data ops (and for doing some other things with the embeddings we’ll be generating), and we’ll need quite a bit of help outside of base R:
We’ll also need some helper functions to talk to the LLMs we’ll be using:
# helper to talk to the llamafile embeddings server
# NOT vectorized
<- \(.x, host = "localhost", port = 8080) {
embed
::POST(
httrurl = glue("http://{host}:{port}/embedding"),
::content_type_json(),
httrencode = "json",
body = list(
content = .x
)-> res
)
::content(res, as = "text") |>
httr::fromJSON() |>
jsonlitegetElement("embedding")
}
# helper to chat with ollama
# NOT vectorized
<- \(model = "llama3:latest", prompt = "What color is the sky?") {
ollama_generate
::POST(
httrurl = "http://localhost:11434/api/generate",
::content_type_json(),
httrencode = "json",
body = list(
model = model,
stream = FALSE,
prompt = prompt
)-> res
)
::content(res, as = "text") |>
httr::fromJSON()
jsonlite
}
And, since I haven’t found an R driver that works great with the new arrays/vector search in DuckDB, we’ll be doing some SQL generation, so a helper for quoting is in order:
# helper for SQL string escaping
# vectorized
<- \(.x) {
esc as.character(DBI::dbQuoteString(DBI::ANSI(), .x))
}
Now, let’s read in the KEV dataset:
# KEV!
::fromJSON("data/kev.json") |>
jsonlitegetElement("vulnerabilities") -> kev
as_tibble(kev) |>
head()
# A tibble: 6 × 11
cveID vendorProject product vulnerabilityName dateAdded shortDescription
<chr> <chr> <chr> <chr> <chr> <chr>
1 CVE-2021-2… Accellion FTA Accellion FTA OS… 2021-11-… Accellion FTA c…
2 CVE-2021-2… Accellion FTA Accellion FTA OS… 2021-11-… Accellion FTA c…
3 CVE-2021-2… Accellion FTA Accellion FTA SQ… 2021-11-… Accellion FTA c…
4 CVE-2021-2… Accellion FTA Accellion FTA Se… 2021-11-… Accellion FTA c…
5 CVE-2021-2… Adobe Acroba… Adobe Acrobat an… 2021-11-… Acrobat Acrobat…
6 CVE-2021-2… Adobe Acroba… Adobe Acrobat an… 2021-11-… Adobe Acrobat a…
# ℹ 5 more variables: requiredAction <chr>, dueDate <chr>,
# knownRansomwareCampaignUse <chr>, notes <chr>, cwes <list>
We’re going to generate embeddings from the vulnerabilityName
, vendorProject
, product
, and shortDescription
columns. This means you need to fire up the mixedbread model:
$ mxbai-embed-large-v1-f16.llamafile --server --nobrowser --embeddings
If you’re like me and run way too many things on your system, you’ll want to look for a line like this in the output in the console:
{"function":"server_cli","hostname":"127.0.0.1","level":"INFO","line":3200,"msg":"HTTP server listening","port":"8081","tid":"1099517127680","timestamp":1723199761}
I had something running on 8080
, so it picked 8081
, and we need to tell the embedding function that.
Now, let’s clean up some of the KEV fields and generate the embeddings:
# clean up KEV and generate embeddings
|>
kev mutate(
shortDescription = stri_replace_all_regex(shortDescription, "[[:space:]]", " "),
ransomware = ifelse(knownRansomwareCampaignUse == "Known", "used in ransomware", ""),
embedding = glue("{vulnerabilityName} {vendorProject} {product} {shortDescription} {ransomware}") |>
map(embed, port = 8081, .progress = TRUE)
|>
) select(
cve_id = cveID,
vendor = vendorProject,
product,description = shortDescription,
embedding-> kev_with_embeddings )
Except, I’m going to cheat and use a saved off version of it (you can, too!). It doesn’t take long to generate the embeddings (I’m using an M1 MacBook Pro Max) but loading a Parquet file is way faster.
<- arrow::read_parquet("kev-with-embeddings.parquet")
kev_with_embeddings
glimpse(kev_with_embeddings)
Rows: 1,143
Columns: 5
$ cve_id <chr> "CVE-2021-27104", "CVE-2021-27102", "CVE-2021-27101", "CVE…
$ vendor <chr> "Accellion", "Accellion", "Accellion", "Accellion", "Adobe…
$ product <chr> "FTA", "FTA", "FTA", "FTA", "Acrobat and Reader", "Acrobat…
$ description <chr> "Accellion FTA contains an OS command injection vulnerabil…
$ embedding <list<double>> <7.690168e-03, -6.912039e-03, -1.226835e-02, 6.11…
We need to know the length of the embeddings vector that the model generates since we need to tell DuckDB the size for the array column (it’s a fixed-size column). We could read the model page or just ask the data frame:
<- length(kev_with_embeddings$embedding[[1]])) (embeddings_length
[1] 1024
Now we need to generate some good ol’ SQL to run at the CLI (if you have the DuckDB vector thing working in R def drop me a note! #pls
):
# turn embeddings R float array into a string for ddb
|>
kev_with_embeddings getElement("embedding") |>
map_chr(\(.x) {
paste0(.x, collapse = ", ")
-> embeddings_for_insert
})
# make the INSERT calls
|>
kev_with_embeddings with(glue(
"(INSERT INTO kev VALUES( {esc(cve_id)}, {esc(vendor)}, {esc(product)}, {esc(description)}, [ {embeddings_for_insert} ] );)"
r-> inserts
))
# eject it to the filesystem
cat(
as.character(glue::glue(r"(
INSTALL vss;
LOAD vss;
SET hnsw_enable_experimental_persistence = true;
CREATE TABLE kev (
cve_id TEXT,
vendor TEXT,
product TEXT,
description TEXT,
vec FLOAT[{embeddings_length}]
);
)")),
inserts,"\nCREATE INDEX kev_hnsw_index ON kev USING HNSW (vec);",
sep = "\n",
file = "kev-embed.sql"
)
Let’s fire that off and see how long it takes:
system.time(system(r"(
(rm -rf kev-embed.db && duckdb kev-embed.db < kev-embed.sql
)", intern = TRUE)))
user system elapsed
6.753 8.652 15.025
PLEASE read the fine print on persisting these embeddings.
Now, we can have some fun with the data in DuckDB:
<- dbConnect(duckdb(), "kev-embed.db")
con
dbGetQuery(con, r"(WITH query AS (
SELECT vec
FROM kev
WHERE cve_id = 'CVE-2021-44228' -- Log4Shell as an example
)
SELECT
kv.cve_id,
kv.description,
kv.product,
array_distance(kv.vec, q.vec) AS distance
FROM
kev kv,
query q
ORDER BY
array_distance(kv.vec, q.vec)
LIMIT 10;
)") -> log4j_alikes
as_tibble(log4j_alikes)
# A tibble: 10 × 4
cve_id description product distance
<chr> <chr> <chr> <dbl>
1 CVE-2021-44228 Apache Log4j2 contains a vulnerability where… Log4j2 0
2 CVE-2021-45046 Apache Log4j2 contains a deserialization of … Log4j2 0.527
3 CVE-2016-8735 Apache Tomcat contains an unspecified vulner… Tomcat 0.677
4 CVE-2017-5638 Apache Struts Jakarta Multipart parser allow… Struts 0.708
5 CVE-2017-12615 When running Apache Tomcat on Windows with H… Tomcat 0.725
6 CVE-2017-12149 The JBoss Application Server, shipped with R… JBoss … 0.725
7 CVE-2010-1871 JBoss Seam 2 (jboss-seam2), as used in JBoss… JBoss … 0.734
8 CVE-2015-2590 An unspecified vulnerability exists within O… Java SE 0.743
9 CVE-2022-24112 Apache APISIX contains an authentication byp… APISIX 0.747
10 CVE-2012-0507 An incorrect type vulnerability exists in th… Java SE 0.749
An exercise for the reader is to grab the same fields from a non-KEV CVE, generate the embeddings, and see which KEV entries it may be close to (which could be a signal it’ll end up in KEV some day).
Also, try other models that take a larger contenxt for the embeddings and see if that changes anything (hint, it kinda does to some degree).
Clustering KEV With Embeddings
Since we have a corpus of embeddings, why not play with them in R a bit?
Let’s turn it into a proper matrix:
do.call(rbind, kev_with_embeddings$embedding) |>
setNames(kev_with_embeddings$cve_id) -> kev_embed_mat
We’ll try a basic kmeans on the corpus with a (bad) guess at the number of groups:
|>
kev_embed_mat as_tibble(.name_repair = NULL) -> kev_embed_df
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
k_means(num_clusters = 10) %>%
set_engine(
engine = "stats",
-> kmeans_spec
)
<- recipe(~., data = kev_embed_df)
embeddings_recipe
workflow() |>
add_model(kmeans_spec) |>
add_recipe(embeddings_recipe) -> kmeans_wf
%>%
kmeans_wf fit(data = kev_embed_df) -> kmeans_fit
%>%
kmeans_fit extract_fit_engine() %>%
getElement("cluster") -> cluster_assignments
<- prcomp(kev_embed_df, center = TRUE, scale. = TRUE) pca_result
tibble(
PC1 = pca_result$x[,1],
PC2 = pca_result$x[,2],
Cluster = as.factor(cluster_assignments),
cve = kev$cveID
-> xdf
)
|>
xdf ggplot() +
geom_text(
aes(x = PC1, y = PC2, color = Cluster, label = cve),
alpha = 0.7,
size = 2,
family = font_gs
+
) labs(
x = "First Principal Component", y = "Second Principal Component",
title = "CISA KEV K-means Clusters Visualized with PCA"
+
) theme_ipsum_gs(grid="XY")
$group <- as.factor(cluster_assignments)
kev
count(kev, group)
group n
1 1 310
2 2 40
3 3 31
4 4 129
5 5 70
6 6 163
7 7 74
8 8 102
9 9 125
10 10 99
Yeah, that’s likely not gonna be great, but I’ll leave y’all to experiment (hint: 3
seems to produce good clusters but I also think we may need more inputs than just the embeddings we’ve got as well).
(Fire up Ollama for this next bit.)
Since we do have some clusters, why not let our LLM overlords tell us what the entries have in common. We’ll make a helper function to shoot a pre-made prompt to Ollama that takes in the descriptions of a group of KEV vulns:
<- \(descriptions) {
guess_theme paste0(descriptions, collapse = "\n") |>
sprintf(
"(Describe the common thread of these vulnerabilities. Only reply with one sentence and no other commentary. Use only the vulnerability descriptions to derive the answer.
r ```
%s
```)", x=_
|>
) ollama_generate(
prompt = _
|>
) getElement("response")
}
And, now we can ask it to do this work for us while we grab a caffeinated beverage:
|>
kev arrange(group) |>
group_by(group) |>
summarise(
theme = guess_theme(shortDescription)
-> themes )
|>
themes pull(theme) |>
stri_replace_all_regex("[[:space:]]", " ") |>
sprintf("- %s", x=_) |>
cat(sep="\n\n")
Here is the common thread of these vulnerabilities in one sentence: Many of these vulnerabilities involve authentication bypasses, code injection, or improper access controls, allowing attackers to execute malicious code, read sensitive information, or gain unauthorized access to systems and data.
All of these vulnerabilities can allow for remote code execution or arbitrary code execution in the context of the current user, which suggests a common thread of untrusted data being deserialized, validated, or handled incorrectly, ultimately leading to code execution.
The common thread among these vulnerabilities is the potential for remote code execution, either directly or indirectly.
The common thread among these vulnerabilities is unauthorized access or execution, often achieved through unauthenticated or authenticated attacks exploiting command injection, buffer overflow, privilege escalation, or other issues in various network devices and systems, including routers, firewalls, VPNs, ADCs, and web user interfaces.
The common thread of these vulnerabilities is improper handling of memory or resources, leading to privilege escalation, information disclosure, or denial-of-service conditions.
The common thread among these vulnerabilities is the potential for remote code execution, which can be achieved through various means such as uploading malicious files, injecting commands or code, or exploiting weaknesses in configuration settings and resource management. (Cloud Foundation)
The common thread of these vulnerabilities is that they all affect Apple products and services, including iOS, iPadOS, macOS, tvOS, watchOS, and Safari WebKit, often impacting HTML parsers or allowing for code execution when processing maliciously crafted web content.
The common thread among these vulnerabilities is the improper handling of objects in memory, which can lead to remote code execution, allowing an attacker to take control of the affected system.
The common thread among these vulnerabilities is the existence of privilege escalation flaws in various components and services of Microsoft Windows, allowing attackers to elevate their privileges from low-integrity to medium-integrity or higher, potentially resulting in unauthorized access, control, or data manipulation.
These vulnerabilities all involve exploitation of heap corruption or memory corruption through crafted HTML pages, leading to potential code execution, sandbox escapes, or denial-of-service attacks.
They might not be bad groups after all.
Try using other Ollama models (or $ ones online) and/or modifying the prompt to see how that impacts the results.
Experiment!
There are some decent building blocks here for using the DuckDB VSS extension. Never be hesitant to experiment. Tweak the features, change-up models, try other clustering methods.