From 29ef1ebc345e1afb64c4a255fa917b68cafbfe17 Mon Sep 17 00:00:00 2001 From: work Date: Thu, 2 Apr 2026 13:36:36 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=8F=91=E5=B8=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- spider/task.py | 170 +++++++++++++++++++++++++++++++------------------ 1 file changed, 109 insertions(+), 61 deletions(-) diff --git a/spider/task.py b/spider/task.py index e7c8756..9729bc3 100644 --- a/spider/task.py +++ b/spider/task.py @@ -101,7 +101,7 @@ def _edit_privacy(page): continue_btn.click() sleep(1, 2) page.click('//div[@aria-label="Select audience"]//span[text()="Public"]') - page.click('//div[@aria-label="Save"]') + page.click('//div[@aria-label="Save privacy audience selection and close dialog"]') sleep(1, 2) return else: @@ -111,7 +111,7 @@ def _edit_privacy(page): page.click('//div[contains(@aria-label, "Edit privacy")]') sleep(1, 2) page.click('//div[@aria-label="Select audience"]//span[text()="Public"]') - page.click('//div[@aria-label="Done"]') + page.click('//div[@aria-label="Done with privacy audience selection and close dialog"]') sleep(1, 2) except Error as e: logger.error(f"Error editing privacy settings: {e}") @@ -225,6 +225,17 @@ def is_operation_failed(exception): return isinstance(exception, OperationFailed) and "更改语言异常" in str(exception) +def is_page_crash_error(exception: Exception) -> bool: + error_message = str(exception).lower() + crash_patterns = ( + "page crashed", + "target crashed", + "target page, context or browser has been closed", + "browser has been closed", + ) + return any(pattern in error_message for pattern in crash_patterns) + + @retry( stop=stop_after_attempt(3), wait=wait_fixed(2), @@ -450,77 +461,110 @@ def retry_get_new_video(page, cookies, post_count): raise TimeoutError("未获取到新视频(可能视频上传失败),已超时") -@retry( - stop=stop_after_attempt(3), - wait=wait_fixed(2), - retry=retry_if_exception(is_operation_failed) -) def playwright_post(cookies, content, image_key=None): path = os.path.join(BASE_PATH, 'chrome', '130-0008', 'chrome.exe') with lock: with sync_playwright() as playwright: update_windows_distinguish() - browser = playwright.chromium.launch( - headless=False, args=['--start-maximized'], executable_path=path - ) - context = browser.new_context(no_viewport=True) - context.add_cookies(parse_cookies(cookies)) - page = context.new_page() - page.evaluate(f'document.body.style.zoom = "{const.DISPLAY_SCALE}"') + max_browser_retries = 3 + last_error = None - check_account_status(page, parse_cookies(cookies)) - # 声明默认发布视频数量 - video_count = 0 + for browser_attempt in range(max_browser_retries): + browser = None + context = None + file_path = None + try: + browser = playwright.chromium.launch( + headless=False, args=['--start-maximized'], executable_path=path + ) + context = browser.new_context(no_viewport=True) + context.add_cookies(parse_cookies(cookies)) + page = context.new_page() - url = 'https://facebook.com' - try: - # 先获取视频数量 - if image_key is not None and ".mp4" in image_key: - video_count = get_post_count(page, cookies) + check_account_status(page, parse_cookies(cookies)) + # 声明默认发布视频数量 + video_count = 0 - retry_goto(page, url) - time.sleep(random.randint(3, 10)) - time.sleep(5) + url = 'https://www.facebook.com' + # 先获取视频数量 + if image_key is not None and ".mp4" in image_key: + video_count = get_post_count(page, cookies) - if image_key: - filename = image_key.split('/')[-1] - file_path = os.path.join(BASE_PATH, 'files', filename) - client.fget_object(BUCKET, image_key, file_path) - - sleep(1, 2) - page.locator('input[accept="image/*,image/heif,image/heic,video/*,video/mp4,video/x-m4v,video/x-matroska,.mkv"]').set_input_files(file_path) + # check_account_status 已经把页面带到 Facebook 首页,避免重复二次跳转导致页面进程崩溃 + if "facebook.com" not in page.url: + retry_goto(page, url) + time.sleep(random.randint(3, 10)) time.sleep(5) - if not image_key: - page.click('''//span[contains(text(), "What's on your mind")]''') - _edit_privacy(page) - # 修改后 (使用 fill) - page.fill('//div[contains(@aria-placeholder, "What\'s on your mind")]', content, - timeout=300000) - page.click('//div[@aria-label="Post"]', timeout=300000) - time.sleep(15) - post_index = page.locator('//div[@aria-posinset="1"]//a[@role="link"]').nth(2) - post_index.click(timeout=600000) - time.sleep(5) - page.reload(timeout=180000) - post_url = page.url - # 视频格式要单独去获取链接 - if image_key is not None and ".mp4" in image_key: - post_url = retry_get_new_video(page, cookies, video_count) - time.sleep(random.randint(3, 10)) + if image_key: + filename = image_key.split('/')[-1] + file_path = os.path.join(BASE_PATH, 'files', filename) + client.fget_object(BUCKET, image_key, file_path) - except Error as e: - raise OperationFailed(f'操作超时,请重试{e}') + sleep(1, 2) + page.locator('input[accept="image/*,image/heif,image/heic,video/*,video/mp4,video/x-m4v,video/x-matroska,.mkv"]').set_input_files(file_path) + time.sleep(5) - screenshot_content = _full_screenshot() - if image_key: - os.remove(file_path) - context.close() - browser.close() + if not image_key: + page.click('''//span[contains(text(), "What's on your mind")]''') + _edit_privacy(page) + # 修改后 (使用 fill) + page.fill('//div[contains(@aria-placeholder, "What\'s on your mind")]', content, + timeout=300000) + page.click('//div[@aria-label="Post"]', timeout=300000) + time.sleep(15) + post_index = page.locator('//div[@aria-posinset="1"]//a[@role="link"]').nth(2) + post_index.click(timeout=600000) + time.sleep(5) + page.reload(timeout=180000) + post_url = page.url + # 视频格式要单独去获取链接 + if image_key is not None and ".mp4" in image_key: + post_url = retry_get_new_video(page, cookies, video_count) + time.sleep(random.randint(3, 10)) - key = f'screenshot/{uuid.uuid4()}.png' - put_object(key, screenshot_content) - return {'response_url': post_url, 'screenshot_key': key} + screenshot_content = _full_screenshot() + key = f'screenshot/{uuid.uuid4()}.png' + put_object(key, screenshot_content) + return {'response_url': post_url, 'screenshot_key': key} + + except TimeoutError as e: + last_error = e + logger.warning( + f"发布任务超时,尝试重建浏览器重试: attempt {browser_attempt + 1}/{max_browser_retries}, error={e}" + ) + except Error as e: + last_error = e + if is_page_crash_error(e): + logger.warning( + f"发布任务页面崩溃,尝试重建浏览器重试: attempt {browser_attempt + 1}/{max_browser_retries}, error={e}" + ) + else: + logger.warning( + f"发布任务 Playwright 异常,尝试重试: attempt {browser_attempt + 1}/{max_browser_retries}, error={e}" + ) + finally: + if image_key and file_path and os.path.exists(file_path): + os.remove(file_path) + if context is not None: + try: + context.close() + except Exception: + pass + if browser is not None: + try: + browser.close() + except Exception: + pass + + if browser_attempt < max_browser_retries - 1: + time.sleep(2) + + if isinstance(last_error, TimeoutError): + raise OperationFailed(f'操作超时,请重试: {last_error}') + if isinstance(last_error, Error) and is_page_crash_error(last_error): + raise OperationFailed(f'页面崩溃,请重试: {last_error}') + raise OperationFailed(f'操作失败,请重试: {last_error}') def playwright_comment(cookies, target_url, content, image_key=None): @@ -772,11 +816,15 @@ def retry_goto(page: "Page", url: str, max_retries: int = 3, retry_delay: int = print(f"All {max_retries} attempts failed for {url}.") # If all retries fail, re-raise the exception raise e + except Error as e: + if is_page_crash_error(e): + logger.error(f"Navigation page crashed for {url} on attempt {attempt + 1}: {e}") + else: + logger.error(f"Navigation playwright error for {url} on attempt {attempt + 1}: {e}") + raise e except Exception as e: # Catch any other unexpected errors during goto print(f"An unexpected error occurred during navigation to {url} on attempt {attempt + 1}: {e}") - # Decide if other exceptions should also trigger retries - # For now, we'll just re-raise other exceptions immediately raise e