[PARSING] urlparss.py

Google Docs neutral 2026-04-11 7 чанков ~5 мин чтения

Сущности

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Extract article text or YouTube metadata/subtitles for a list of URLs.

Web pages are rendered with Playwright (falling back to a plain HTTP GET),
parsed with Newspaper3k and, where that gives poor results, with Readability.
YouTube links are handled separately: metadata via pytube (falling back to
yt_dlp when available) and subtitles via youtube_transcript_api.
"""

import re
import sys
import time
from pathlib import Path
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup
from newspaper import Article
from playwright.sync_api import sync_playwright
from pytube import YouTube
from youtube_transcript_api import (
    NoTranscriptFound,
    TranscriptsDisabled,
    VideoUnavailable,
    YouTubeTranscriptApi,
)

# Readability (readability-lxml) is mandatory: the script exits without it.
try:
    from readability import Document
except ImportError:
    print("Установите readability-lxml (pip install readability-lxml)")
    sys.exit(1)


# -----------------------------------------------------------------------------
# Alternative extraction using Readability
# -----------------------------------------------------------------------------
def extract_with_readability(html):
    """Extract the title and main text of a page with Readability.

    Args:
        html: Raw HTML of the page.

    Returns:
        A ``(title, text)`` tuple. Both are empty strings when extraction
        fails; the error is printed rather than raised.
    """
    try:
        doc = Document(html)
        title = doc.title()
        # summary() yields the HTML of the main content; flatten it to text.
        summary_html = doc.summary()
        soup = BeautifulSoup(summary_html, "html.parser")
        text = soup.get_text(separator="\n").strip()
        return title, text
    except Exception as e:
        print("Ошибка извлечения через Readability:", e)
        return "", ""


# -----------------------------------------------------------------------------
# Base article extraction using Newspaper3k and alternative methods
# -----------------------------------------------------------------------------
def _detect_language(soup, url):
    """Choose the language code handed to Newspaper for this page.

    The default is "ru". English pages map to "en", Kazakh and Russian pages
    map to "ru", and Reuters URLs are always treated as English. Any other
    ``<html lang>`` value is reduced to its primary subtag ("de-DE" -> "de")
    and used only when that subtag is two letters long.
    """
    language = "ru"
    html_tag = soup.find("html")
    if html_tag and html_tag.has_attr("lang"):
        lang = html_tag["lang"].strip().lower()
        if lang.startswith("en"):
            language = "en"
        elif lang.startswith(("kk", "ru")):
            language = "ru"
        else:
            # NOTE(review): newspaper is understood to accept only two-letter
            # codes, so region-qualified or empty values are normalised here
            # instead of being passed through verbatim - confirm against the
            # installed newspaper version.
            primary = re.split(r"[-_]", lang)[0]
            if len(primary) == 2 and primary.isalpha():
                language = primary
    if "reuters.com" in url:
        language = "en"
    return language


def extract_article_info(html, url):
    """Extract title, authors, publication date and text from article HTML.

    Newspaper3k is tried first. ``og:title`` and ``article:published_time``
    meta tags override its title/date when present. Readability is used as a
    fallback when the text is shorter than 200 characters or a Reuters page
    has no usable title. If no authors were found, the first line starting
    with "By" is split into author names.

    Args:
        html: Raw HTML of the page.
        url: Page URL (used by Newspaper and for the Reuters special cases).

    Returns:
        A dict with keys ``title`` (str), ``authors`` (list of str),
        ``publish_date`` (str) and ``text`` (str).
    """
    soup = BeautifulSoup(html, "html.parser")
    language = _detect_language(soup, url)

    data = {"title": "", "authors": [], "publish_date": "", "text": ""}
    # A Newspaper failure (for example empty HTML or a language it cannot
    # handle) must not abort the run: leaving the text empty triggers the
    # Readability fallback below.
    try:
        article = Article(url=url, language=language)
        article.download(input_html=html)
        article.parse()
    except Exception as e:
        print("Ошибка извлечения через Newspaper3k:", e)
    else:
        data = {
            "title": article.title.strip() if article.title else "",
            "authors": article.authors or [],
            "publish_date": (
                str(article.publish_date).strip() if article.publish_date else ""
            ),
            "text": article.text.strip() if article.text else "",
        }

    # Prefer metadata from meta tags when it is present.
    meta_title = soup.find("meta", property="og:title")
    if meta_title and meta_title.get("content"):
        meta_title_val = meta_title["content"].strip()
        if meta_title_val.lower() not in ["reuters.com", ""]:
            data["title"] = meta_title_val
    meta_date = soup.find("meta", property="article:published_time")
    if meta_date and meta_date.get("content"):
        data["publish_date"] = meta_date["content"].strip()

    # Use Readability when a Reuters title is unusable or the text is too short.
    reuters_title_bad = "reuters.com" in url and (
        not data["title"] or data["title"].strip().lower() == "reuters.com"
    )
    if reuters_title_bad or len(data["text"]) < 200:
        rb_title, rb_text = extract_with_readability(html)
        if rb_title and rb_title.lower() != "reuters.com":
            data["title"] = rb_title
        if rb_text and len(rb_text) > len(data["text"]):
            data["text"] = rb_text

    # If no authors were found, look for a line that starts with "By".
    # The pattern is anchored to the start of a line so that "by" inside
    # other words (e.g. "Baby steps") is not mistaken for a byline.
    if not data["authors"]:
        alt_text = soup.get_text(separator="\n")
        m_authors = re.search(
            r"^[ \t]*By\s+(.+)", alt_text, re.IGNORECASE | re.MULTILINE
        )
        if m_authors:
            authors_line = m_authors.group(1).strip()
            # Split on commas or the word "and".
            authors = re.split(r",|\band\b", authors_line)
            data["authors"] = [a.strip() for a in authors if a.strip()]
    return data


def post_process_article(data):
    """Clean up the dict produced by ``extract_article_info``.

    If the text contains "<dd.mm.yyyy>, автор <name>", that date and author
    replace the extracted ones and the matched fragment is removed from the
    text. Authors without any Latin or Cyrillic letter are dropped. The
    standalone tokens RU/KZ/EN, the Sputnik media e-mail and ISO-like
    timestamps are stripped from the text, and runs of blank lines are
    collapsed to a single blank line.

    Args:
        data: Dict with ``title``, ``authors``, ``publish_date`` and ``text``.

    Returns:
        A new dict with the same four keys.
    """
    title = data.get("title", "").strip()
    authors = data.get("authors", [])
    pub_date = data.get("publish_date", "").strip()
    text = data.get("text", "")

    match = re.search(r"(\d{2}\.\d{2}\.\d{4}),\s*автор\s+([^.]+)", text)
    if match:
        pub_date = match.group(1).strip()
        authors = [match.group(2).strip()]
        text = text.replace(match.group(0), "")

    # Keep only author entries that contain at least one letter.
    filtered_authors = [a for a in authors if re.search(r"[A-Za-zА-ЯЁа-яё]", a)]

    text = re.sub(r"\b(RU|KZ|EN)\b", "", text)
    text = re.sub(r"media@sputniknews\.com\S*", "", text)
    text = re.sub(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}\+\d{4}", "", text)
    text = re.sub(r"\n\s*\n+", "\n\n", text).strip()

    return {
        "title": title,
        "authors": filtered_authors,
        "publish_date": pub_date,
        "text": text,
    }


# -----------------------------------------------------------------------------
# Helpers for YouTube links
# -----------------------------------------------------------------------------
def format_duration(seconds):
    """Format a duration in seconds as ``HH:MM:SS``.

    ``None`` is treated as zero and fractional values are truncated, so a
    missing or float duration no longer raises.
    """
    total = int(seconds or 0)
    m, s = divmod(total, 60)
    h, m = divmod(m, 60)
    return f"{h:02d}:{m:02d}:{s:02d}"


def get_youtube_metadata(url):
    """Return ``(title, channel, duration, description)`` for a YouTube URL.

    pytube is tried first; if it raises or returns no title/channel, yt_dlp
    is used when it is installed. Four empty strings are returned when both
    sources fail. Errors are printed, never raised.
    """
    try:
        yt = YouTube(url)
        video_title = yt.title
        channel_name = yt.author
        video_duration = format_duration(yt.length)
        video_description = yt.description
        if not video_title or not channel_name:
            raise Exception("Неполучены метаданные через pytube")
        return video_title, channel_name, video_duration, video_description
    except Exception as e:
        print(f"Ошибка получения метаданных через pytube: {e}")

    # Fallback: yt_dlp is optional, so it is imported only when needed.
    try:
        import yt_dlp
    except ImportError:
        print("Модуль yt_dlp не установлен. Установите его через: pip install yt-dlp")
        return "", "", "", ""

    ydl_opts = {
        "skip_download": True,
        "quiet": True,
        "no_warnings": True,
        "format": "best",
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False) or {}
        # Fields may be present but None; normalise them to empty values.
        return (
            info.get("title") or "",
            info.get("uploader") or "",
            format_duration(info.get("duration")),
            info.get("description") or "",
        )
    except Exception as e:
        print(f"Ошибка получения метаданных через yt_dlp: {e}")
        return "", "", "", ""


def get_video_id(yt_url):
    """Return the YouTube video id contained in ``yt_url``, or ``None``.

    Only URLs whose host is youtu.be, youtube.com (or a subdomain) or
    youtube-nocookie.com are considered, so a ``v=`` query parameter on an
    unrelated site is not mistaken for a video id. Besides ``watch?v=<id>``
    and ``youtu.be/<id>``, the path forms ``/shorts/<id>``, ``/embed/<id>``,
    ``/live/<id>`` and ``/v/<id>`` are recognised.
    """
    # Allow scheme-less input such as "youtube.com/watch?v=...".
    candidate_url = yt_url if "://" in yt_url else f"//{yt_url}"
    try:
        parsed = urlparse(candidate_url)
        host = (parsed.hostname or "").lower()
    except ValueError:
        return None

    if host in ("youtu.be", "www.youtu.be"):
        raw_id = parsed.path.lstrip("/").split("/", 1)[0]
    elif (
        host == "youtube.com"
        or host.endswith(".youtube.com")
        or host in ("youtube-nocookie.com", "www.youtube-nocookie.com")
    ):
        raw_id = parse_qs(parsed.query).get("v", [""])[0]
        if not raw_id:
            path_match = re.match(r"/(?:shorts|embed|live|v)/([^/?#]+)", parsed.path)
            raw_id = path_match.group(1) if path_match else ""
    else:
        return None

    # Same character class and minimum length as the original patterns.
    id_match = re.match(r"[A-Za-z0-9_\-]{5,}", raw_id)
    return id_match.group(0) if id_match else None


def fetch_subtitles(video_id, languages=("ru", "en")):
    """Return the subtitles of a video as newline-joined text.

    Args:
        video_id: YouTube video id.
        languages: Preferred subtitle languages, in priority order.

    Returns:
        The subtitle text, or a human-readable message (in Russian) that
        describes why subtitles could not be fetched. Nothing is raised.
    """
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(
            video_id, languages=list(languages)
        )
    except TranscriptsDisabled:
        return "Субтитры отключены для этого видео."
    except NoTranscriptFound:
        return "Субтитры для данного видео не найдены."
    except VideoUnavailable:
        return "Видео недоступно или удалено."
    except Exception as e:
        return f"Ошибка при получении субтитров: {e}"
    return "\n".join(item["text"] for item in transcript_list)


def process_youtube_link(url):
    """Collect id, subtitles and metadata for a YouTube URL.

    Returns:
        ``(video_id, subtitles_text, video_title, channel_name,
        video_duration, video_description)``. All six values are ``None``
        when no video id can be derived from ``url``. Whitespace in the
        subtitles is collapsed to single spaces.
    """
    video_id = get_video_id(url)
    if not video_id:
        print("Не удалось определить video_id из ссылки. Пропускаем.")
        return None, None, None, None, None, None
    video_title, channel_name, video_duration, video_description = (
        get_youtube_metadata(url)
    )
    subtitles_text = fetch_subtitles(video_id)
    subtitles_text = " ".join(subtitles_text.split())
    return (
        video_id,
        subtitles_text,
        video_title,
        channel_name,
        video_duration,
        video_description,
    )


# -----------------------------------------------------------------------------
# Page download and article processing
# -----------------------------------------------------------------------------
# Navigation timeout and the extra wait after page load, both in milliseconds.
PLAYWRIGHT_TIMEOUT = 60000
WAIT_AFTER_LOAD = 5000


def render_with_playwright(url):
    """Load ``url`` in headless Chromium and return the rendered HTML.

    Waits for ``domcontentloaded`` and then a further ``WAIT_AFTER_LOAD``
    milliseconds before reading the page content. Exceptions raised while
    loading propagate to the caller; the browser is closed either way.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                # The original literal contained a mistranslated, non-ASCII
                # "как Gecko"; the standard token is "like Gecko".
                user_agent=(
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/98.0.4758.102 Safari/537.36"
                ),
                ignore_https_errors=True,
            )
            page = context.new_page()
            page.goto(url, timeout=PLAYWRIGHT_TIMEOUT)
            page.wait_for_load_state("domcontentloaded", timeout=PLAYWRIGHT_TIMEOUT)
            page.wait_for_timeout(WAIT_AFTER_LOAD)
            return page.content()
        finally:
            browser.close()


def fallback_get(url):
    """Fetch ``url`` with a plain HTTP GET and return the response body.

    Raises:
        requests.RequestException: On network errors or a non-2xx status.
    """
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    return r.text


def process_link(url):
    """Download a page and return its cleaned article data.

    Playwright is tried up to three times, pausing two seconds between
    attempts; if every attempt fails, a plain HTTP GET is used.

    Returns:
        The dict produced by ``post_process_article``. When the page cannot
        be downloaded at all, a placeholder dict whose ``text`` says so.
    """
    max_attempts = 3
    html = None
    for attempt in range(1, max_attempts + 1):
        try:
            html = render_with_playwright(url)
            break
        except Exception as e:
            print(f"[Playwright] Попытка {attempt} не удалась: {e}")
            # No point in sleeping after the last attempt.
            if attempt < max_attempts:
                time.sleep(2)

    if html is None:
        print(f"[Fallback] Обращаемся к requests для {url}")
        try:
            html = fallback_get(url)
        except Exception as e:
            print(f"Ошибка и в fallback для {url}: {e}")
            return {
                "title": "",
                "authors": [],
                "publish_date": "",
                "text": "Не удалось обработать страницу.",
            }

    data = extract_article_info(html, url)
    return post_process_article(data)


# -----------------------------------------------------------------------------
# Entry point: batch processing of the links listed in link.txt
# -----------------------------------------------------------------------------
# Substrings that mark a host as one whose links are skipped.
FORBIDDEN_HOST_MARKERS = ("telegram", "instagram", "gov.kz")


def is_forbidden_url(url):
    """Return True when ``url`` points at a telegram, instagram or gov.kz host.

    Only the host part is inspected, so an article whose path merely mentions
    one of these words is not skipped. If no host can be parsed, the whole
    string is checked instead.
    """
    from urllib.parse import urlparse

    # Allow scheme-less entries such as "instagram.com/user".
    candidate = url if "://" in url else f"//{url}"
    try:
        host = (urlparse(candidate).hostname or "").lower()
    except ValueError:
        host = ""
    haystack = host or url
    return any(marker in haystack for marker in FORBIDDEN_HOST_MARKERS)


def _build_report(url):
    """Return the text to save for ``url``, or None if a YouTube link failed."""
    if get_video_id(url):
        result = process_youtube_link(url)
        if result[1] is None:
            return None
        (
            video_id,
            subtitles_text,
            video_title,
            channel_name,
            video_duration,
            video_description,
        ) = result
        return (
            f"URL: {url}\n"
            f"VIDEO_ID: {video_id}\n"
            f"Канал: {channel_name}\n"
            f"Название видео: {video_title}\n"
            f"Длительность: {video_duration}\n"
            f"Описание:\n{video_description}\n\n"
            "Субтитры:\n"
            f"{subtitles_text}\n"
        )

    data = process_link(url)
    return (
        f"URL: {url}\n"
        f"Заголовок: {data['title']}\n"
        f"Автор(ы): {data['authors']}\n"
        f"Дата: {data['publish_date']}\n\n"
        f"{data['text']}\n"
    )


def main():
    """Process every URL in ``link.txt`` and save one result file per URL.

    Each non-empty line of ``link.txt`` is a URL. Links to forbidden hosts
    are skipped. YouTube links produce metadata plus subtitles; every other
    link produces the extracted article. Output goes to ``result_<i>.txt``,
    where ``i`` is the 1-based position of the URL in the file (skipped URLs
    still consume a number). A failure on one URL is reported and the loop
    moves on to the next one.

    Exits with status 1 when ``link.txt`` cannot be read and with status 0
    when it contains no links.
    """
    try:
        with open("link.txt", "r", encoding="utf-8") as f:
            urls = [line.strip() for line in f if line.strip()]
    except (OSError, UnicodeError) as e:
        print(f"Ошибка чтения link.txt: {e}")
        sys.exit(1)

    if not urls:
        print("Файл link.txt пуст или не содержит ссылок.")
        sys.exit(0)

    total = len(urls)
    for i, url in enumerate(urls, start=1):
        if is_forbidden_url(url):
            print(f"Пропускаем ссылку (запрещенный домен): {url}")
            continue

        print(f"=== Обработка {i}/{total}: {url}")
        # Top-level boundary: one broken page must not abort the whole batch.
        try:
            report = _build_report(url)
            if report is None:
                print("Ошибка при обработке YouTube ссылки, пропускаем.")
                continue
            output_filename = Path(f"result_{i}.txt")
            with open(output_filename, "w", encoding="utf-8") as fout:
                fout.write(report)
        except Exception as e:
            print(f"Ошибка при обработке {url}: {e}")
            continue
        print(f" => Сохранено в {output_filename}\n")


if __name__ == "__main__":
    main()