Skip to content

Commit 7a5db25

Browse files
authored
feat(safari): localstorage extraction (#582)
* feat(safari): localstorage extraction Extracts Safari 17+ localStorage from WebKit's nested layout — WebsiteDataStore/<uuid>/Origins/<top-hash>/<frame-hash>/LocalStorage/ localstorage.sqlite3 for named profiles, WebsiteData/Default for the default profile. Parses the binary SecurityOrigin serialization (length-prefixed scheme+host plus 0x00 default-port or 0x01 <uint16_le> explicit-port section) and decodes UTF-16 LE ItemTable value BLOBs, capping oversized values at 2048 bytes to match the Chromium extractor. Reports the frame origin URL so partitioned third-party storage is attributed to the iframe origin JavaScript actually sees. Closes the remaining LocalStorage checkbox in #565. * docs(safari): add RFC-011 data storage Documents Safari's profile structure, per-category file layouts, and storage formats including the Safari 17+ nested WebKit Origins localStorage layout and binary SecurityOrigin serialization. Defers Keychain credential extraction to RFC-006 §7 and notes the cross-browser differences (plaintext cookies, plist bookmarks/downloads, Core Data epoch timestamps, partitioned storage). * fix(safari): latin-1 origin decoding, NULL key skip, count fast-path - Decode originEncASCII via decodeLatin1 so high-byte records preserve their ISO-8859-1 meaning instead of being interpreted as UTF-8. Matches the pattern in chromium/extract_storage.go. - Skip ItemTable rows where key is NULL — SQLite's UNIQUE constraint permits multiple NULLs, and silently lowering them to empty strings would collide with legitimate empty-string keys. - countLocalStorage now walks origin dirs and runs SELECT COUNT(key) per localstorage.sqlite3 instead of fully decoding every value. COUNT(key) naturally excludes NULLs, keeping count and extract symmetric. Addresses Copilot review feedback on #582. * fix(safari): round-2 review — WAL replay, stable ordering, error context - Drop immutable=1 on temp-copy SQLite opens in readLocalStorageFile / countLocalStorageFile. 
Session.Acquire copies the -wal / -shm sidecars, so mode=ro alone lets SQLite replay WAL on the ephemeral copy and surface entries Safari committed to WAL but hasn't checkpointed yet. Live-file reads in profiles.go keep immutable=1 as before. - Order ItemTable query by (key, rowid) for deterministic exports across runs and SQLite versions. - Wrap os.ReadFile / os.ReadDir errors with the offending path so multi-origin debug logs stay scannable. - RFC-011 §7 rewritten to explain the live-vs-temp split. - New regression test asserts ORDER BY surfaces rows in key order. Addresses round-2 Copilot review on #582.
1 parent d75738b commit 7a5db25

7 files changed

Lines changed: 1157 additions & 11 deletions

File tree

browser/safari/extract_storage.go

Lines changed: 344 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,344 @@
1+
package safari
2+
3+
import (
4+
"database/sql"
5+
"encoding/binary"
6+
"fmt"
7+
"os"
8+
"path/filepath"
9+
"unicode/utf16"
10+
11+
_ "modernc.org/sqlite"
12+
13+
"github.com/moond4rk/hackbrowserdata/log"
14+
"github.com/moond4rk/hackbrowserdata/types"
15+
)
16+
17+
// Modern WebKit (Safari 17+) stores localStorage under a nested, partitioned layout rooted at
18+
// either WebsiteDataStore/<uuid>/Origins (per named profile) or WebsiteData/Default
19+
// (the pre-profile default store). Within that root:
20+
//
21+
// <root>/<top-frame-hash>/<frame-hash>/origin — binary; encodes top+frame origins
22+
// <root>/<top-frame-hash>/<frame-hash>/LocalStorage/localstorage.sqlite3
23+
//
24+
// top-hash == frame-hash ⇒ first-party; they differ for third-party partitioned storage.
25+
// We report the frame origin because that's what window.localStorage exposes to JS.
26+
// ItemTable: (key TEXT UNIQUE ON CONFLICT REPLACE, value BLOB NOT NULL ON CONFLICT FAIL);
27+
// value BLOBs are UTF-16 LE strings.
28+
//
29+
// The flat "LocalStorage/<scheme>_<host>_<port>.localstorage" directory that older builds used
30+
// is empty on current Safari and is no longer a supported source.
31+
32+
// Names of the files and directories that make up one origin partition on disk.
const (
	// webkitOriginFile is the binary file describing the top-frame and frame origins.
	webkitOriginFile = "origin"
	// webkitLocalStorageSubdir is the directory that holds the localStorage database.
	webkitLocalStorageSubdir = "LocalStorage"
	// webkitLocalStorageDB is the SQLite database file name inside webkitLocalStorageSubdir.
	webkitLocalStorageDB = "localstorage.sqlite3"
	// webkitOriginSaltName is the HMAC salt that sits next to the <hash> dirs; it is not a
	// data dir and is skipped while scanning.
	webkitOriginSaltName = "salt"
)

// maxLocalStorageValueLength is the BLOB size, in bytes, at or above which a value is replaced
// by a "too long" marker instead of being decoded.
const maxLocalStorageValueLength = 2048
40+
41+
// Marker bytes found in an origin file (WebCore SecurityOrigin serialization).
const (
	// originEncUTF16 is the encoding byte of a string record whose payload is UTF-16 LE.
	originEncUTF16 = 0x00
	// originEncASCII is the encoding byte of a string record whose payload is Latin-1 / ASCII.
	originEncASCII = 0x01

	// originPortDefaultMarker follows the (scheme, host) pair when the port is the scheme
	// default; the port is then stored as 0.
	originPortDefaultMarker = 0x00
	// originPortExplicitFlag follows the (scheme, host) pair when the next two bytes are a
	// uint16_le port.
	originPortExplicitFlag = 0x01
)
54+
55+
func extractLocalStorage(root string) ([]types.StorageEntry, error) {
56+
dirs, err := findOriginDataDirs(root)
57+
if err != nil {
58+
return nil, err
59+
}
60+
61+
var entries []types.StorageEntry
62+
for _, od := range dirs {
63+
origin, err := readOriginFile(filepath.Join(od, webkitOriginFile))
64+
if err != nil {
65+
log.Debugf("safari localstorage: origin %s: %v", od, err)
66+
continue
67+
}
68+
dbPath := filepath.Join(od, webkitLocalStorageSubdir, webkitLocalStorageDB)
69+
items, err := readLocalStorageFile(dbPath)
70+
if err != nil {
71+
log.Debugf("safari localstorage: db %s: %v", dbPath, err)
72+
continue
73+
}
74+
for _, it := range items {
75+
entries = append(entries, types.StorageEntry{
76+
URL: origin,
77+
Key: it.key,
78+
Value: it.value,
79+
})
80+
}
81+
}
82+
return entries, nil
83+
}
84+
85+
// countLocalStorage sums ItemTable row counts across every origin DB under root without
86+
// parsing origin files or decoding values — CountEntries callers only need the total, not the
87+
// URLs or plaintext. COUNT(key) naturally excludes NULL keys, matching the same skip rule
88+
// applied by readLocalStorageFile, so count and extract stay in sync.
89+
func countLocalStorage(root string) (int, error) {
90+
dirs, err := findOriginDataDirs(root)
91+
if err != nil {
92+
return 0, err
93+
}
94+
total := 0
95+
for _, od := range dirs {
96+
dbPath := filepath.Join(od, webkitLocalStorageSubdir, webkitLocalStorageDB)
97+
n, err := countLocalStorageFile(dbPath)
98+
if err != nil {
99+
log.Debugf("safari localstorage: count %s: %v", dbPath, err)
100+
continue
101+
}
102+
total += n
103+
}
104+
return total, nil
105+
}
106+
107+
// countLocalStorageFile opens the SQLite database at path read-only and returns the number of
// ItemTable rows whose key is not NULL (COUNT(key) ignores NULLs).
//
// The DSN uses mode=ro without immutable so that SQLite replays a copied -wal sidecar; this
// surfaces rows Safari committed to the WAL but has not checkpointed yet. Any writes SQLite
// makes to the temp copy's -shm while doing so are harmless because the copy is discarded
// afterwards.
//
// Open, ping and query failures are returned wrapped with context.
func countLocalStorageFile(path string) (int, error) {
	db, err := sql.Open("sqlite", "file:"+path+"?mode=ro")
	if err != nil {
		return 0, fmt.Errorf("open %s: %w", path, err)
	}
	defer db.Close()

	// sql.Open is lazy; Ping forces the connection so a bad file fails here with its path.
	if err = db.Ping(); err != nil {
		return 0, fmt.Errorf("ping %s: %w", path, err)
	}

	var rowCount int
	row := db.QueryRow(`SELECT COUNT(key) FROM ItemTable`)
	if err = row.Scan(&rowCount); err != nil {
		return 0, fmt.Errorf("count ItemTable: %w", err)
	}
	return rowCount, nil
}
127+
128+
// findOriginDataDirs returns <root>/<h1>/<h2>/ paths that contain both an "origin" file and
129+
// a "LocalStorage/localstorage.sqlite3" database. Non-directory entries, the "salt" sibling,
130+
// and partition dirs without localStorage data are silently skipped.
131+
func findOriginDataDirs(root string) ([]string, error) {
132+
topEntries, err := os.ReadDir(root)
133+
if err != nil {
134+
return nil, fmt.Errorf("read origins root %s: %w", root, err)
135+
}
136+
var out []string
137+
for _, top := range topEntries {
138+
if !top.IsDir() || top.Name() == webkitOriginSaltName {
139+
continue
140+
}
141+
topPath := filepath.Join(root, top.Name())
142+
frameEntries, err := os.ReadDir(topPath)
143+
if err != nil {
144+
continue
145+
}
146+
for _, frame := range frameEntries {
147+
if !frame.IsDir() {
148+
continue
149+
}
150+
framePath := filepath.Join(topPath, frame.Name())
151+
if _, err := os.Stat(filepath.Join(framePath, webkitOriginFile)); err != nil {
152+
continue
153+
}
154+
dbPath := filepath.Join(framePath, webkitLocalStorageSubdir, webkitLocalStorageDB)
155+
if _, err := os.Stat(dbPath); err != nil {
156+
continue
157+
}
158+
out = append(out, framePath)
159+
}
160+
}
161+
return out, nil
162+
}
163+
164+
// originEndpoint holds one of the two origins stored in an origin file: either the top-frame
// origin or the frame origin.
type originEndpoint struct {
	// scheme and host are the decoded string records of the origin block.
	scheme, host string
	// port is 0 when the origin uses the scheme default (e.g. 443 for https, 80 for http);
	// a zero port is left out when the endpoint is rendered as a URL.
	port uint16
}
171+
172+
// readOriginFile parses WebKit's SecurityOrigin binary serialization and returns the frame
173+
// origin URL (scheme://host[:port]). The file holds two origin blocks back-to-back: top-frame
174+
// then frame. When the frame block is missing/unreadable we fall back to the top-frame so we
175+
// can still attribute the data to *something* meaningful.
176+
func readOriginFile(path string) (string, error) {
177+
data, err := os.ReadFile(path)
178+
if err != nil {
179+
return "", fmt.Errorf("read origin file %s: %w", path, err)
180+
}
181+
top, pos, terr := readOriginBlock(data, 0)
182+
if terr != nil {
183+
return "", fmt.Errorf("parse top-frame origin: %w", terr)
184+
}
185+
frame, _, ferr := readOriginBlock(data, pos)
186+
if ferr != nil {
187+
// Partitioned info unavailable — attribute to the top-frame origin.
188+
frame = top
189+
}
190+
if frame.scheme == "" || frame.host == "" {
191+
return "", fmt.Errorf("origin file missing scheme/host")
192+
}
193+
return formatOriginURL(frame), nil
194+
}
195+
196+
// readOriginBlock reads one origin block: scheme record, host record, port marker.
197+
// Returns the parsed endpoint and the byte offset immediately after the block.
198+
func readOriginBlock(data []byte, pos int) (originEndpoint, int, error) {
199+
var ep originEndpoint
200+
var err error
201+
ep.scheme, pos, err = readOriginString(data, pos)
202+
if err != nil {
203+
return ep, pos, err
204+
}
205+
ep.host, pos, err = readOriginString(data, pos)
206+
if err != nil {
207+
return ep, pos, err
208+
}
209+
if pos >= len(data) {
210+
return ep, pos, fmt.Errorf("unexpected EOF before port marker")
211+
}
212+
marker := data[pos]
213+
pos++
214+
switch marker {
215+
case originPortDefaultMarker:
216+
ep.port = 0
217+
case originPortExplicitFlag:
218+
if pos+2 > len(data) {
219+
return ep, pos, fmt.Errorf("truncated port value at offset %d", pos)
220+
}
221+
ep.port = binary.LittleEndian.Uint16(data[pos : pos+2])
222+
pos += 2
223+
default:
224+
return ep, pos, fmt.Errorf("unexpected port marker 0x%02x at offset %d", marker, pos-1)
225+
}
226+
return ep, pos, nil
227+
}
228+
229+
// readOriginString consumes one length-prefixed record (uint32_le length + encoding byte + data).
230+
func readOriginString(data []byte, pos int) (string, int, error) {
231+
if pos+5 > len(data) {
232+
return "", pos, fmt.Errorf("truncated string record at offset %d", pos)
233+
}
234+
length := int(binary.LittleEndian.Uint32(data[pos : pos+4]))
235+
enc := data[pos+4]
236+
pos += 5
237+
if length < 0 || pos+length > len(data) {
238+
return "", pos, fmt.Errorf("string record overruns buffer: length %d at offset %d", length, pos-5)
239+
}
240+
chunk := data[pos : pos+length]
241+
pos += length
242+
switch enc {
243+
case originEncASCII:
244+
return decodeLatin1(chunk), pos, nil
245+
case originEncUTF16:
246+
return decodeUTF16LE(chunk), pos, nil
247+
default:
248+
return decodeLatin1(chunk), pos, nil
249+
}
250+
}
251+
252+
// decodeLatin1 interprets b as ISO-8859-1 and returns the equivalent UTF-8 Go string.
// Every Latin-1 byte value is the Unicode code point of the same number (U+0000–U+00FF), so
// each byte is widened to a rune before the string conversion encodes it as UTF-8.
func decodeLatin1(b []byte) string {
	codePoints := make([]rune, 0, len(b))
	for i := 0; i < len(b); i++ {
		codePoints = append(codePoints, rune(b[i]))
	}
	return string(codePoints)
}
261+
262+
func formatOriginURL(ep originEndpoint) string {
263+
url := ep.scheme + "://" + ep.host
264+
if ep.port != 0 {
265+
url += fmt.Sprintf(":%d", ep.port)
266+
}
267+
return url
268+
}
269+
270+
// localStorageItem is one decoded ItemTable row: the key column as text and the value BLOB
// after it has been passed through decodeLocalStorageValue.
type localStorageItem struct {
	key, value string
}
274+
275+
func readLocalStorageFile(path string) ([]localStorageItem, error) {
276+
// mode=ro (no immutable) — see countLocalStorageFile for the WAL-replay rationale; the same
277+
// live-vs-temp split applies here. ORDER BY key, rowid makes exports byte-for-byte stable
278+
// across runs and SQLite versions.
279+
dsn := "file:" + path + "?mode=ro"
280+
db, err := sql.Open("sqlite", dsn)
281+
if err != nil {
282+
return nil, fmt.Errorf("open %s: %w", path, err)
283+
}
284+
defer db.Close()
285+
if err := db.Ping(); err != nil {
286+
return nil, fmt.Errorf("ping %s: %w", path, err)
287+
}
288+
289+
rows, err := db.Query(`SELECT key, value FROM ItemTable ORDER BY key, rowid`)
290+
if err != nil {
291+
return nil, fmt.Errorf("query ItemTable: %w", err)
292+
}
293+
defer rows.Close()
294+
295+
var items []localStorageItem
296+
for rows.Next() {
297+
var key sql.NullString
298+
var value []byte
299+
if err := rows.Scan(&key, &value); err != nil {
300+
log.Debugf("safari localstorage: scan row in %s: %v", path, err)
301+
continue
302+
}
303+
if !key.Valid {
304+
// NULL keys would collide with legitimate empty-string keys in the output and are
305+
// not meaningful localStorage entries. The UNIQUE constraint in ItemTable still
306+
// permits multiple NULL rows in SQLite, so we filter them here.
307+
log.Debugf("safari localstorage: skip row with NULL key in %s", path)
308+
continue
309+
}
310+
items = append(items, localStorageItem{
311+
key: key.String,
312+
value: decodeLocalStorageValue(value),
313+
})
314+
}
315+
return items, rows.Err()
316+
}
317+
318+
// decodeLocalStorageValue treats the BLOB as UTF-16 LE. Values at or above the cap are replaced
319+
// with a size marker to keep JSON/CSV output bounded, matching chromium/extract_storage.go.
320+
func decodeLocalStorageValue(b []byte) string {
321+
if len(b) >= maxLocalStorageValueLength {
322+
return fmt.Sprintf(
323+
"value is too long, length is %d, supported max length is %d",
324+
len(b), maxLocalStorageValueLength,
325+
)
326+
}
327+
return decodeUTF16LE(b)
328+
}
329+
330+
// decodeUTF16LE decodes b as little-endian UTF-16 and returns it as a Go string.
// Empty input yields "". Odd-length input cannot be valid UTF-16, so its bytes are returned
// unchanged as a string rather than dropping the value over one stray byte.
func decodeUTF16LE(b []byte) string {
	switch {
	case len(b) == 0:
		return ""
	case len(b)&1 == 1:
		return string(b)
	}

	units := make([]uint16, 0, len(b)/2)
	for off := 0; off < len(b); off += 2 {
		units = append(units, binary.LittleEndian.Uint16(b[off:off+2]))
	}
	return string(utf16.Decode(units))
}

0 commit comments

Comments
 (0)