From 6795b09d0a84f460d2ce673dff454c2c8c8bacd1 Mon Sep 17 00:00:00 2001
From: Shane Smiskol
Date: Wed, 4 Mar 2026 03:16:29 -0800
Subject: [PATCH] file_downloader: stream downloads in a single HTTP request (#37549)

The Python file downloader was making a separate HTTP Range request per
1MB chunk via URLFile.read(), causing massive latency overhead. Use a
single streaming GET request instead, matching the old C++ behavior.

Co-authored-by: Claude Opus 4.6
---
 tools/lib/file_downloader.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/tools/lib/file_downloader.py b/tools/lib/file_downloader.py
index c9c26bb307..5b31a5894c 100755
--- a/tools/lib/file_downloader.py
+++ b/tools/lib/file_downloader.py
@@ -60,8 +60,16 @@ def cmd_download(args):
     return
 
   try:
-    uf = URLFile(url, cache=False)
-    total = uf.get_length()
+    # Stream the file in a single HTTP request instead of making
+    # a separate Range request per chunk (which was very slow).
+    pool = URLFile.pool_manager()
+    r = pool.request("GET", url, preload_content=False)
+    if r.status not in (200, 206):
+      sys.stderr.write(f"ERROR:HTTP {r.status}\n")
+      sys.stderr.flush()
+      sys.exit(1)
+
+    total = int(r.headers.get('content-length', 0))
     if total <= 0:
       sys.stderr.write("ERROR:File not found or empty\n")
       sys.stderr.flush()
@@ -73,8 +81,7 @@ def cmd_download(args):
       downloaded = 0
       chunk_size = 1024 * 1024
       with os.fdopen(tmp_fd, 'wb') as f:
-        while downloaded < total:
-          data = uf.read(min(chunk_size, total - downloaded))
+        for data in r.stream(chunk_size):
           f.write(data)
           downloaded += len(data)
           sys.stderr.write(f"PROGRESS:{downloaded}:{total}\n")
@@ -91,6 +98,8 @@ def cmd_download(args):
       except OSError:
         pass
       raise
+    finally:
+      r.release_conn()
 
   except Exception as e:
     sys.stderr.write(f"ERROR:{e}\n")