Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
290 changes: 290 additions & 0 deletions browser/safari/extract_storage.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
package safari

import (
"database/sql"
"encoding/binary"
"fmt"
"os"
"path/filepath"
"unicode/utf16"

_ "modernc.org/sqlite"

"github.com/moond4rk/hackbrowserdata/log"
"github.com/moond4rk/hackbrowserdata/types"
)

// Modern WebKit (Safari 17+) stores localStorage under a nested, partitioned layout rooted at
// either WebsiteDataStore/<uuid>/Origins (per named profile) or WebsiteData/Default
// (the pre-profile default store). Within that root:
//
// <root>/<top-frame-hash>/<frame-hash>/origin — binary; encodes top+frame origins
// <root>/<top-frame-hash>/<frame-hash>/LocalStorage/localstorage.sqlite3
//
// top-hash == frame-hash ⇒ first-party; they differ for third-party partitioned storage.
// We report the frame origin because that's what window.localStorage exposes to JS.
// ItemTable: (key TEXT UNIQUE ON CONFLICT REPLACE, value BLOB NOT NULL ON CONFLICT FAIL);
// value BLOBs are UTF-16 LE strings.
//
// The flat "LocalStorage/<scheme>_<host>_<port>.localstorage" directory that older builds used
// is empty on current Safari and is no longer a supported source.

// File and directory names used when walking an origins root, plus the size cap applied to
// localStorage values before they are decoded.
const (
// webkitOriginFile is the name of the per-partition file that is parsed for the origin URL.
webkitOriginFile = "origin"
// webkitLocalStorageSubdir is the subdirectory of a partition dir that holds the database.
webkitLocalStorageSubdir = "LocalStorage"
// webkitLocalStorageDB is the SQLite database file name inside webkitLocalStorageSubdir.
webkitLocalStorageDB = "localstorage.sqlite3"
// webkitOriginSaltName is skipped by name when listing the top level of the origins root.
webkitOriginSaltName = "salt" // HMAC salt sibling of the <hash> dirs; not a data dir

// maxLocalStorageValueLength is the value BLOB size in bytes at or above which the value is
// replaced by a size marker instead of being decoded.
maxLocalStorageValueLength = 2048
)

// origin file encoding-byte constants (WebCore SecurityOrigin serialization).
// The encoding byte is the fifth byte of every string record, right after the uint32 LE length.
const (
originEncASCII = 0x01 // Latin-1 / ASCII; the payload bytes are converted with string(chunk) as-is
originEncUTF16 = 0x00 // UTF-16 LE; the payload is decoded with decodeUTF16LE
)

// Port marker values after the (scheme, host) pair in an origin block.
// The marker is a single byte; readOriginBlock rejects any value other than these two.
// 0x00 → port is the scheme default (stored as 0).
// 0x01 → next two bytes are a uint16_le port.
const (
originPortDefaultMarker = 0x00
originPortExplicitFlag = 0x01
)

// extractLocalStorage collects every localStorage row found under root and tags each one with
// the origin URL parsed from its partition directory.
//
// Failing to enumerate the partition directories is the only error returned. A partition whose
// origin file or database cannot be read is logged at debug level and skipped, so one broken
// partition does not hide the others.
func extractLocalStorage(root string) ([]types.StorageEntry, error) {
	originDirs, err := findOriginDataDirs(root)
	if err != nil {
		return nil, err
	}

	var entries []types.StorageEntry
	for _, dir := range originDirs {
		originURL, originErr := readOriginFile(filepath.Join(dir, webkitOriginFile))
		if originErr != nil {
			log.Debugf("safari localstorage: origin %s: %v", dir, originErr)
			continue
		}

		dbPath := filepath.Join(dir, webkitLocalStorageSubdir, webkitLocalStorageDB)
		rows, dbErr := readLocalStorageFile(dbPath)
		if dbErr != nil {
			log.Debugf("safari localstorage: db %s: %v", dbPath, dbErr)
			continue
		}

		for i := range rows {
			entry := types.StorageEntry{
				URL:   originURL,
				Key:   rows[i].key,
				Value: rows[i].value,
			}
			entries = append(entries, entry)
		}
	}
	return entries, nil
}

// countLocalStorage reports how many entries extractLocalStorage produces for root.
// It runs the full extraction and counts the result; an extraction error is passed through
// together with a zero count.
func countLocalStorage(root string) (int, error) {
	var total int
	entries, err := extractLocalStorage(root)
	if err == nil {
		total = len(entries)
	}
	return total, err
}

Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

countLocalStorage currently calls extractLocalStorage and decodes all values just to compute a count. For large Origins trees this can be significantly slower than necessary during CountEntries. Consider counting without value decoding (e.g., traverse origin dirs and run SELECT COUNT(*) per localstorage.sqlite3, summing results).

Suggested change
entries, err := extractLocalStorage(root)
if err != nil {
return 0, err
}
return len(entries), nil
}
dirs, err := findOriginDataDirs(root)
if err != nil {
return 0, err
}
total := 0
for _, od := range dirs {
dbPath := filepath.Join(od, webkitLocalStorageSubdir, webkitLocalStorageDB)
n, err := countLocalStorageFile(dbPath)
if err != nil {
log.Debugf("safari localstorage: db %s: %v", dbPath, err)
continue
}
total += n
}
return total, nil
}
func countLocalStorageFile(dbPath string) (int, error) {
db, err := sql.Open("sqlite", dbPath)
if err != nil {
return 0, err
}
defer db.Close()
var count int
if err := db.QueryRow(`SELECT COUNT(*) FROM ItemTable`).Scan(&count); err != nil {
return 0, err
}
return count, nil
}

Copilot uses AI. Check for mistakes.
// findOriginDataDirs returns <root>/<h1>/<h2>/ paths that contain both an "origin" file and
// a "LocalStorage/localstorage.sqlite3" database. Non-directory entries, the "salt" sibling,
// and partition dirs without localStorage data are silently skipped.
//
// Only a failure to list root itself is returned as an error; a top-level directory that
// cannot be listed is skipped. The result is nil when no directory qualifies.
func findOriginDataDirs(root string) ([]string, error) {
topEntries, err := os.ReadDir(root)
if err != nil {
return nil, fmt.Errorf("read origins root: %w", err)
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error returned here doesn’t include the root path, which makes it harder to trace which profile/store failed when multiple Origins roots are processed. Including root in the formatted message would improve diagnostics.

Suggested change
return nil, fmt.Errorf("read origins root: %w", err)
return nil, fmt.Errorf("read origins root %s: %w", root, err)

Copilot uses AI. Check for mistakes.
}
var out []string
for _, top := range topEntries {
// The salt entry is excluded by name at this level only; second-level entries are
// filtered purely on being directories.
if !top.IsDir() || top.Name() == webkitOriginSaltName {
continue
}
topPath := filepath.Join(root, top.Name())
frameEntries, err := os.ReadDir(topPath)
if err != nil {
// Best-effort walk: an unreadable top-level dir is dropped without logging.
continue
}
for _, frame := range frameEntries {
if !frame.IsDir() {
continue
}
framePath := filepath.Join(topPath, frame.Name())
// Both paths only need to be resolvable by os.Stat; their contents are validated
// later, when they are actually read.
if _, err := os.Stat(filepath.Join(framePath, webkitOriginFile)); err != nil {
continue
}
dbPath := filepath.Join(framePath, webkitLocalStorageSubdir, webkitLocalStorageDB)
if _, err := os.Stat(dbPath); err != nil {
continue
}
out = append(out, framePath)
}
}
return out, nil
}

// originEndpoint is one half of an origin file (top-frame or frame). Port 0 means the scheme
// default (443 for https, 80 for http) and is omitted from the URL rendering.
type originEndpoint struct {
// scheme is the first string record of the origin block.
scheme string
// host is the second string record of the origin block.
host string
// port is the explicit uint16 LE port from the block, or 0 when the default marker was stored.
port uint16
}

// readOriginFile parses WebKit's SecurityOrigin binary serialization and returns the frame
// origin URL (scheme://host[:port]). The file holds two origin blocks back-to-back: top-frame
// then frame. When the frame block is missing/unreadable we fall back to the top-frame so we
// can still attribute the data to *something* meaningful.
//
// An error is returned when the file cannot be read, when the top-frame block does not parse,
// or when the origin finally selected has an empty scheme or host.
func readOriginFile(path string) (string, error) {
data, err := os.ReadFile(path)
if err != nil {
return "", err
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

readOriginFile returns the raw os.ReadFile error without path context, which makes debug logs harder to interpret. Wrap the error with the filename (e.g. fmt.Errorf("read origin file %s: %w", path, err)) for consistent diagnostics with the rest of this package.

Suggested change
return "", err
return "", fmt.Errorf("read origin file %s: %w", path, err)

Copilot uses AI. Check for mistakes.
}
// The top-frame block starts at offset 0; pos is where the frame block is expected to begin.
top, pos, terr := readOriginBlock(data, 0)
if terr != nil {
return "", fmt.Errorf("parse top-frame origin: %w", terr)
}
// The offset after the frame block is not needed, and its parse error is deliberately
// not propagated: it only triggers the fallback below.
frame, _, ferr := readOriginBlock(data, pos)
if ferr != nil {
// Partitioned info unavailable — attribute to the top-frame origin.
frame = top
}
// A block can parse successfully yet carry zero-length strings; reject those here.
if frame.scheme == "" || frame.host == "" {
return "", fmt.Errorf("origin file missing scheme/host")
}
return formatOriginURL(frame), nil
}

// readOriginBlock decodes a single origin block starting at pos: a scheme string record, a host
// string record, then a one-byte port marker that is optionally followed by a uint16 LE port.
// On success it returns the endpoint and the offset immediately after the block. On failure it
// returns whatever was parsed so far, the offset reached, and an error describing whether a
// record was malformed, the data ended early, or the port marker was unrecognized.
func readOriginBlock(data []byte, pos int) (originEndpoint, int, error) {
	var (
		ep  originEndpoint
		err error
	)
	if ep.scheme, pos, err = readOriginString(data, pos); err != nil {
		return ep, pos, err
	}
	if ep.host, pos, err = readOriginString(data, pos); err != nil {
		return ep, pos, err
	}
	if len(data) <= pos {
		return ep, pos, fmt.Errorf("unexpected EOF before port marker")
	}

	// Remember where the marker sat so the error message can point at it after advancing.
	markerOffset := pos
	marker := data[markerOffset]
	pos++

	if marker == originPortDefaultMarker {
		ep.port = 0
		return ep, pos, nil
	}
	if marker != originPortExplicitFlag {
		return ep, pos, fmt.Errorf("unexpected port marker 0x%02x at offset %d", marker, markerOffset)
	}
	if len(data)-pos < 2 {
		return ep, pos, fmt.Errorf("truncated port value at offset %d", pos)
	}
	ep.port = binary.LittleEndian.Uint16(data[pos : pos+2])
	return ep, pos + 2, nil
}

// readOriginString consumes one length-prefixed record (uint32_le length + encoding byte + data).
//
// It returns the decoded string and the offset just past the record. An error is returned when
// fewer than five header bytes remain at pos or when the declared length runs past the end of
// data. The length counts payload bytes, not characters. A record with an unrecognized encoding
// byte is not an error: its payload is returned via a plain string conversion.
func readOriginString(data []byte, pos int) (string, int, error) {
// Header is 4 length bytes plus 1 encoding byte.
if pos+5 > len(data) {
return "", pos, fmt.Errorf("truncated string record at offset %d", pos)
}
length := int(binary.LittleEndian.Uint32(data[pos : pos+4]))
enc := data[pos+4]
pos += 5
// length can only come out negative where int is 32 bits wide; pos-5 reports the record start.
if length < 0 || pos+length > len(data) {
return "", pos, fmt.Errorf("string record overruns buffer: length %d at offset %d", length, pos-5)
}
chunk := data[pos : pos+length]
pos += length
switch enc {
case originEncASCII:
return string(chunk), pos, nil
case originEncUTF16:
return decodeUTF16LE(chunk), pos, nil
Comment on lines +243 to +246
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

originEncASCII is documented as Latin-1/ASCII, but this branch converts bytes with string(chunk) (UTF-8). If WebKit stores Latin-1 here, non-ASCII bytes will be mis-decoded. Consider decoding ISO-8859-1 (similar to browser/chromium/extract_storage.go’s decodeLatin1) to match the documented encoding.

Copilot uses AI. Check for mistakes.
default:
// Unknown encoding byte: fall back to the raw bytes rather than failing the record.
return string(chunk), pos, nil
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The default branch silently accepts unknown origin string encodings by returning string(chunk). This can propagate corrupted scheme/host into output URLs. Consider returning an error on unknown encodings (similar to Chromium’s decodeChromiumString behavior) so the caller can skip the partition cleanly.

Copilot uses AI. Check for mistakes.
}
}

// formatOriginURL renders ep as scheme://host, appending :port only when the port is non-zero
// (a zero port stands for the scheme default and is left out).
func formatOriginURL(ep originEndpoint) string {
	base := ep.scheme + "://" + ep.host
	if ep.port == 0 {
		return base
	}
	return base + fmt.Sprintf(":%d", ep.port)
}

// localStorageItem is one row read from ItemTable by readLocalStorageFile.
type localStorageItem struct {
// key is the row's key column; a NULL key is stored here as the empty string.
key string
// value is the row's value BLOB after decodeLocalStorageValue has been applied.
value string
}

// readLocalStorageFile opens the SQLite database at path and returns every (key, value) row of
// ItemTable, with each value passed through decodeLocalStorageValue.
//
// Errors from opening, pinging, or querying the database are returned wrapped. A row that fails
// to scan is logged at debug level and skipped. An iteration error from rows.Err() is returned
// alongside the items collected up to that point. No ORDER BY is applied, so row order is
// whatever SQLite yields.
func readLocalStorageFile(path string) ([]localStorageItem, error) {
// Read-only + immutable so we don't disturb a live WAL (same pattern as profiles.go).
dsn := "file:" + path + "?mode=ro&immutable=1"
db, err := sql.Open("sqlite", dsn)
if err != nil {
return nil, fmt.Errorf("open %s: %w", path, err)
}
defer db.Close()
// sql.Open may defer the real connection; Ping surfaces an unopenable file before querying.
if err := db.Ping(); err != nil {
return nil, fmt.Errorf("ping %s: %w", path, err)
Comment on lines +275 to +286
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

immutable=1 disables WAL replay, but the extractor is reading from the temp copy of the Origins directory (and the repo’s filemanager.Session copies WAL/SHM). Dropping immutable=1 and keeping read-only access will make localStorage extraction include committed data that hasn’t been checkpointed yet.

Copilot uses AI. Check for mistakes.
}

rows, err := db.Query(`SELECT key, value FROM ItemTable`)
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Querying ItemTable without an ORDER BY can yield nondeterministic output ordering across runs/SQLite versions. Consider ordering by key (and/or rowid) to keep exports stable and tests/consumers deterministic.

Suggested change
rows, err := db.Query(`SELECT key, value FROM ItemTable`)
rows, err := db.Query(`SELECT key, value FROM ItemTable ORDER BY key, rowid`)

Copilot uses AI. Check for mistakes.
if err != nil {
return nil, fmt.Errorf("query ItemTable: %w", err)
}
defer rows.Close()

var items []localStorageItem
for rows.Next() {
// NullString lets a NULL key scan without error; key.Valid is not consulted below,
// so such a row is emitted with an empty key.
var key sql.NullString
var value []byte
if err := rows.Scan(&key, &value); err != nil {
log.Debugf("safari localstorage: scan row in %s: %v", path, err)
continue
}
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ItemTable key is scanned into sql.NullString but key.Valid is not checked. If a row has a NULL key, it will be emitted as an empty string, potentially causing collisions and misleading output. Consider skipping rows where key.Valid is false (or logging and continuing).

Suggested change
}
}
if !key.Valid {
log.Debugf("safari localstorage: skip row with NULL key in %s", path)
continue
}

Copilot uses AI. Check for mistakes.
items = append(items, localStorageItem{
key: key.String,
value: decodeLocalStorageValue(value),
})
}
// Return what was read together with any error that ended the iteration early.
return items, rows.Err()
}

// decodeLocalStorageValue converts a value BLOB to text by decoding it as UTF-16 LE. A BLOB
// whose byte length reaches maxLocalStorageValueLength is not decoded; a message stating its
// size and the limit is returned instead, which keeps JSON/CSV output bounded in the same way
// as chromium/extract_storage.go.
func decodeLocalStorageValue(b []byte) string {
	if size := len(b); size < maxLocalStorageValueLength {
		return decodeUTF16LE(b)
	}
	return fmt.Sprintf(
		"value is too long, length is %d, supported max length is %d",
		len(b), maxLocalStorageValueLength,
	)
}

// decodeUTF16LE interprets b as little-endian UTF-16 code units and returns the decoded text.
// Empty input yields "". Odd-length input is malformed UTF-16, so its bytes are returned
// unchanged as a Go string: WebKit values are even-length in practice, and a stray byte should
// not cost the whole row.
func decodeUTF16LE(b []byte) string {
	switch {
	case len(b) == 0:
		return ""
	case len(b)&1 == 1:
		return string(b)
	}

	units := make([]uint16, 0, len(b)/2)
	for off := 0; off < len(b); off += 2 {
		units = append(units, binary.LittleEndian.Uint16(b[off:off+2]))
	}
	return string(utf16.Decode(units))
}
Loading
Loading