11import z from "zod"
2- import { Effect , Scope } from "effect"
2+ import { Effect , Option , Scope } from "effect"
33import { createReadStream } from "fs"
4- import { open } from "fs/promises"
54import * as path from "path"
65import { createInterface } from "readline"
76import * as Tool from "./tool"
@@ -11,12 +10,14 @@ import DESCRIPTION from "./read.txt"
1110import { Instance } from "../project/instance"
1211import { assertExternalDirectoryEffect } from "./external-directory"
1312import { Instruction } from "../session/instruction"
13+ import { isImageAttachment , isPdfAttachment , sniffAttachmentMime } from "@/util/media"
1414
// Default read limit; its use is outside this excerpt.
const DEFAULT_READ_LIMIT = 2000
// Character count quoted in the truncation suffix below.
const MAX_LINE_LENGTH = 2000
const MAX_LINE_SUFFIX = "... (line truncated to " + MAX_LINE_LENGTH + " chars)"
// Byte budget (50 * 1024) and the label derived from it, expressed in KB.
const MAX_BYTES = 50 * 1024
const MAX_BYTES_LABEL = MAX_BYTES / 1024 + " KB"
// Sample size handed to readSample when sniffing a file's content.
const SAMPLE_BYTES = 4096
2021
2122const parameters = z . object ( {
2223 filePath : z . string ( ) . describe ( "The absolute path to the file or directory to read" ) ,
@@ -77,6 +78,64 @@ export const ReadTool = Tool.define(
7778 yield * lsp . touchFile ( filepath , false ) . pipe ( Effect . ignore , Effect . forkIn ( scope ) )
7879 } )
7980
/**
 * Reads a leading sample of `filepath` for content sniffing.
 *
 * At most `sampleSize` bytes are requested, capped at `fileSize`. An empty
 * `Uint8Array` is returned when `fileSize` is 0 (the file is not opened in
 * that case) or when the read produces no data. The open/read sequence runs
 * inside `Effect.scoped`.
 */
const readSample = Effect.fn("ReadTool.readSample")(function* (
  filepath: string,
  fileSize: number,
  sampleSize: number,
) {
  const empty = () => new Uint8Array()
  if (fileSize === 0) return empty()

  // Never request more bytes than the file is reported to contain.
  const wanted = Math.min(sampleSize, fileSize)
  const sampled = Effect.gen(function* () {
    const handle = yield* fs.open(filepath, { flag: "r" })
    const chunk = yield* handle.readAlloc(wanted)
    return Option.getOrElse(chunk, empty)
  })

  return yield* Effect.scoped(sampled)
})
91+
// Extensions that are classified as binary without looking at the content.
const BINARY_EXTENSIONS: ReadonlySet<string> = new Set([
  ".zip",
  ".tar",
  ".gz",
  ".exe",
  ".dll",
  ".so",
  ".class",
  ".jar",
  ".war",
  ".7z",
  ".doc",
  ".docx",
  ".xls",
  ".xlsx",
  ".ppt",
  ".pptx",
  ".odt",
  ".ods",
  ".odp",
  ".bin",
  ".dat",
  ".obj",
  ".o",
  ".a",
  ".lib",
  ".wasm",
  ".pyc",
  ".pyo",
])

/**
 * Decides whether a file should be treated as binary.
 *
 * The (case-insensitive) extension of `filepath` is checked first against a
 * fixed list. Otherwise `bytes` — a sample of the file's content — is scanned:
 * any NUL byte means binary, and so does a sample in which more than 30% of
 * the bytes are control characters other than 9–13 (tab through carriage return).
 * An empty sample is never binary.
 */
const isBinaryFile = (filepath: string, bytes: Uint8Array) => {
  if (BINARY_EXTENSIONS.has(path.extname(filepath).toLowerCase())) return true
  if (bytes.length === 0) return false

  let controlBytes = 0
  for (const byte of bytes) {
    if (byte === 0) return true
    const isWhitespaceControl = byte >= 9 && byte <= 13
    if (byte < 32 && !isWhitespaceControl) controlBytes += 1
  }

  return controlBytes / bytes.length > 0.3
}
138+
80139 const run = Effect . fn ( "ReadTool.execute" ) ( function * ( params : z . infer < typeof parameters > , ctx : Tool . Context ) {
81140 if ( params . offset !== undefined && params . offset < 1 ) {
82141 return yield * Effect . fail ( new Error ( "offset must be greater than or equal to 1" ) )
@@ -141,12 +200,12 @@ export const ReadTool = Tool.define(
141200 }
142201
143202 const loaded = yield * instruction . resolve ( ctx . messages , filepath , ctx . messageID )
203+ const sample = yield * readSample ( filepath , Number ( stat . size ) , SAMPLE_BYTES )
144204
145- const mime = AppFileSystem . mimeType ( filepath )
146- const isImage = mime . startsWith ( "image/" ) && mime !== "image/svg+xml" && mime !== "image/vnd.fastbidsheet"
147- const isPdf = mime === "application/pdf"
148- if ( isImage || isPdf ) {
149- const msg = `${ isImage ? "Image" : "PDF" } read successfully`
205+ const mime = sniffAttachmentMime ( sample , AppFileSystem . mimeType ( filepath ) )
206+ if ( isImageAttachment ( mime ) || isPdfAttachment ( mime ) ) {
207+ const bytes = yield * fs . readFile ( filepath )
208+ const msg = isPdfAttachment ( mime ) ? "PDF read successfully" : "Image read successfully"
150209 return {
151210 title,
152211 output : msg ,
@@ -159,13 +218,13 @@ export const ReadTool = Tool.define(
159218 {
160219 type : "file" as const ,
161220 mime,
162- url : `data:${ mime } ;base64,${ Buffer . from ( yield * fs . readFile ( filepath ) ) . toString ( "base64" ) } ` ,
221+ url : `data:${ mime } ;base64,${ Buffer . from ( bytes ) . toString ( "base64" ) } ` ,
163222 } ,
164223 ] ,
165224 }
166225 }
167226
168- if ( yield * Effect . promise ( ( ) => isBinaryFile ( filepath , Number ( stat . size ) ) ) ) {
227+ if ( isBinaryFile ( filepath , sample ) ) {
169228 return yield * Effect . fail ( new Error ( `Cannot read binary file: ${ filepath } ` ) )
170229 }
171230
@@ -261,63 +320,3 @@ async function lines(filepath: string, opts: { limit: number; offset: number })
261320
262321 return { raw, count, cut, more, offset : opts . offset }
263322}
264-
// Common non-text extensions that are classified as binary without reading the file.
const BINARY_FILE_EXTENSIONS: readonly string[] = [
  ".zip",
  ".tar",
  ".gz",
  ".exe",
  ".dll",
  ".so",
  ".class",
  ".jar",
  ".war",
  ".7z",
  ".doc",
  ".docx",
  ".xls",
  ".xlsx",
  ".ppt",
  ".pptx",
  ".odt",
  ".ods",
  ".odp",
  ".bin",
  ".dat",
  ".obj",
  ".o",
  ".a",
  ".lib",
  ".wasm",
  ".pyc",
  ".pyo",
]

/**
 * Decides whether the file at `filepath` should be treated as binary.
 *
 * A known binary extension short-circuits to `true`, and a `fileSize` of 0 to
 * `false`; neither case touches the disk. Otherwise up to 4096 bytes are read
 * from the start of the file: a NUL byte means binary, as does a sample in
 * which more than 30% of the bytes are control characters outside 9–13.
 *
 * @param filepath path of the file to inspect
 * @param fileSize size of the file in bytes, used to cap the sample
 * @returns `true` when the file is considered binary
 */
async function isBinaryFile(filepath: string, fileSize: number): Promise<boolean> {
  const extension = path.extname(filepath).toLowerCase()
  if (BINARY_FILE_EXTENSIONS.includes(extension)) return true
  if (fileSize === 0) return false

  const handle = await open(filepath, "r")
  try {
    const wanted = Math.min(4096, fileSize)
    const { bytesRead, buffer } = await handle.read(Buffer.alloc(wanted), 0, wanted, 0)
    if (bytesRead === 0) return false

    let controlBytes = 0
    // Only the bytes actually read are inspected; the rest of the buffer is padding.
    for (const byte of buffer.subarray(0, bytesRead)) {
      if (byte === 0) return true
      const isWhitespaceControl = byte >= 9 && byte <= 13
      if (byte < 32 && !isWhitespaceControl) controlBytes += 1
    }

    // If >30% non-printable characters, consider it binary
    return controlBytes / bytesRead > 0.3
  } finally {
    await handle.close()
  }
}
0 commit comments