From 4169ab431aad3fb52d90f94c4b4b736d7a43d3e6 Mon Sep 17 00:00:00 2001 From: No0ne558 Date: Tue, 2 Jun 2026 04:41:26 -0700 Subject: [PATCH] fix: prevent printing and order saving from halting after 10-12 hours of runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three bugs combined to cause the system to stop printing and saving new orders after extended runtime. Root cause (remote_printer.cc): PrinterCB() closed the socket and set failure=999 but never called RemoveInputFn(). The Xt event loop kept polling the closed FD on every tick, spinning forever. Over 10-12 hours this CPU waste starved the main event thread. Fix: deregister the input handler before closing the socket. Secondary (printer.cc): Printer::Close() called the synchronous LPDPrint() — which shells out to lpr via system() — on the main thread. A stalled or unresponsive CUPS daemon blocked the entire event loop. Fix: Close() now routes TARGET_LPD and TARGET_SOCKET through the existing CloseAsync() thread-pool path. Tertiary (archive.cc, data_file.hh): Archive::SavePacked() did not detect write errors after the drawer and check loops. A disk-full or I/O error was silently swallowed and the archive marked as saved. Fix: added OutputDataFile::HasError() (ferror/gzerror), which is now checked after each critical write loop. 
--- docs/changelog.md | 8 ++++++++ main/data/archive.cc | 10 ++++++++++ main/hardware/printer.cc | 21 +++++++++++++-------- main/hardware/remote_printer.cc | 12 +++++++++++- src/core/data_file.hh | 11 +++++++++++ 5 files changed, 53 insertions(+), 9 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 9f2dcb5f..77a3e78c 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ## [Unreleased] +### Fixed +- **Printing & Ticket Saving: Fix 10-12 hour runtime failure** (2026-06-02) + - After extended runtime (~10-12 hours) the system would silently stop printing and saving new orders. Three bugs combined to cause this. + - **Root cause — stale Xt input handler spin loop (`main/hardware/remote_printer.cc`):** `PrinterCB()` was closing the socket and marking the printer offline (`failure = 999`) but never calling `RemoveInputFn(p->input_id)`. The X event loop continued polling the closed file descriptor on every tick, creating an infinite error-callback spin loop. Over 10-12 hours of normal restaurant operation this CPU waste progressively starved the main event thread, causing both printing and order-saving to hang. Fix: deregister the input handler (`RemoveInputFn` + `input_id = -1`) before closing the socket in the offline-marking block. + - **Secondary — blocking `system()` call in `LPDPrint` (`main/hardware/printer.cc`):** `Printer::Close()` called the synchronous `LPDPrint()` (which runs `system("cat file | lpr -P...")`) on the main thread. If CUPS became unresponsive this blocked the entire event loop for the duration. Fix: `Close()` now delegates `TARGET_LPD` and `TARGET_SOCKET` to `CloseAsync()`, which already dispatches print jobs to a background thread pool. 
+ - **Tertiary — silent I/O errors in archive save (`main/data/archive.cc`, `src/core/data_file.hh`):** `Archive::SavePacked()` did not check for write errors after the drawer and check write loops. A disk-full or I/O error would be silently ignored and the archive marked as successfully saved. Fix: added `OutputDataFile::HasError()` (using `ferror`/`gzerror`) and inserted checks after both write loops with appropriate `ReportError` messages and early-return on failure. + - Files modified: `main/hardware/remote_printer.cc`, `main/hardware/printer.cc`, `main/data/archive.cc`, `src/core/data_file.hh`. + ### Removed - **Button Properties Dialog: Remove redundant "Menu Type" field** (2026-05-28) - The "Menu Type" selector (`item_type` `DialogMenu` widget in `ZoneDialog`) was redundant with the more specific zone types already present in "Button's Type" (`ZONE_ITEM_NORMAL`, `ZONE_ITEM_MODIFIER`, `ZONE_ITEM_METHOD`, `ZONE_ITEM_SUBSTITUTE`, `ZONE_ITEM_POUND`, `ZONE_ITEM_ADMISSION`). The item classification (`itype`) is now derived entirely from the selected zone type; for the legacy generic `ZONE_ITEM` type the classification defaults to `ITEM_NORMAL`. The network protocol byte is preserved unchanged. 
diff --git a/main/data/archive.cc b/main/data/archive.cc index 4e3712f6..27dd3ea6 100644 --- a/main/data/archive.cc +++ b/main/data/archive.cc @@ -644,6 +644,11 @@ int Archive::SavePacked() drawer->Write(df, DRAWER_VERSION); drawer = drawer->next; } + if (df.HasError()) + { + ReportError("Archive::SavePacked(): I/O error writing drawers — archive may be corrupt"); + return 1; + } // Save Checks count = 0; @@ -659,6 +664,11 @@ int Archive::SavePacked() c->Write(df, CHECK_VERSION); c = c->next; } + if (df.HasError()) + { + ReportError("Archive::SavePacked(): I/O error writing checks — archive may be corrupt"); + return 1; + } // Save Tips count = 0; diff --git a/main/hardware/printer.cc b/main/hardware/printer.cc index 461d626e..9cc23105 100644 --- a/main/hardware/printer.cc +++ b/main/hardware/printer.cc @@ -258,8 +258,17 @@ int Printer::Close() { FnTrace("Printer::Close()"); - // assume we're going to do something. But only close if we're pretty - // sure we have a valid file handle + // LPD and socket printing block the calling thread (system() / connect()). + // Delegate to CloseAsync() which dispatches to a thread pool so the main + // event loop is never stalled waiting for CUPS or a network printer. + // CloseAsync() owns temp_fd/temp_name cleanup for these two types. + if (target_type == TARGET_LPD || target_type == TARGET_SOCKET) + { + CloseAsync(); + return 0; + } + + // For all other target types: close the file handle, dispatch, then clean up. 
if (temp_fd > 0) close(temp_fd); switch (target_type) @@ -267,18 +276,14 @@ int Printer::Close() case TARGET_PARALLEL: ParallelPrint(); break; - case TARGET_LPD: - LPDPrint(); - break; - case TARGET_SOCKET: - SocketPrint(); - break; case TARGET_FILE: FilePrint(); break; case TARGET_EMAIL: EmailPrint(); break; + default: + break; } // delete the temp file unless printing to the parallel port might still need it if (target_type != TARGET_PARALLEL) diff --git a/main/hardware/remote_printer.cc b/main/hardware/remote_printer.cc index 2d8ce027..747827cc 100644 --- a/main/hardware/remote_printer.cc +++ b/main/hardware/remote_printer.cc @@ -579,6 +579,16 @@ void PrinterCB(XtPointer client_data, int *fid, XtInputId *id) p->host_name.Value(), p->port_no, p->failure); ReportError(errmsg.data()); + // Deregister the Xt input handler BEFORE closing the socket. + // Without this, the event loop keeps polling the closed FD every tick, + // spinning in an infinite error loop that starves the main thread after + // ~10-12 hours of accumulated failures. + if (p->input_id >= 0) + { + RemoveInputFn(p->input_id); + p->input_id = -1; + } + if (p->socket_no >= 0) { // close socket here instead of letting the destructor do it @@ -587,7 +597,7 @@ void PrinterCB(XtPointer client_data, int *fid, XtInputId *id) p->socket_no = -1; } - // Critical fix: Don't kill the printer immediately, mark it for reconnection + // Don't kill the printer immediately, mark it for reconnection if (db) { // Mark printer as offline but keep it in the list for reconnection attempts diff --git a/src/core/data_file.hh b/src/core/data_file.hh index 8c73e781..f88a5c85 100644 --- a/src/core/data_file.hh +++ b/src/core/data_file.hh @@ -131,6 +131,17 @@ public: int Write(Flt *val, int bk = 0); int Write(Str *val, int bk = 0); [[nodiscard]] const std::string &FileName() const noexcept { return filename; } + // Returns true if any underlying write has failed (e.g. disk full). 
+    [[nodiscard]] bool HasError() const noexcept
+    { // NOTE(review): ferror() only reports errors the stream has already recorded; presumably data still buffered can fail later at flush/close — confirm callers also check the close result
+        if (compress && gz_fp) // compressed mode with an open gz handle: query zlib instead of stdio
+        {
+            int errnum = 0;
+            gzerror(gz_fp, &errnum); // only the numeric code is used; the returned message string is discarded
+            return errnum != Z_OK && errnum != Z_STREAM_END; // Z_OK and Z_STREAM_END are both treated as "no error"
+        }
+        return file_fp && ferror(file_fp) != 0; // a null file_fp yields false (no error reported)
+    }
 };
 
 /*********************************************************************