discordBot/utils/news.py

97 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
from datetime import datetime
import requests
# Habr RSS feed: daily top articles of the "artificial_intelligence" hub.
# NOTE(review): the "fl=ru" query parameter looks like a language filter — confirm.
RSS_URL_ARTICLES = "https://habr.com/ru/rss/hubs/artificial_intelligence/articles/top/daily/?fl=ru"
# Habr RSS feed: daily top items of the hub's "news" section (the constant is
# named POSTS, but the URL path points at /news/).
RSS_URL_POSTS = "https://habr.com/ru/rss/hubs/artificial_intelligence/news/top/daily/?fl=ru"
# Module-level requests session; fetch_rss issues its GET requests through it.
_session = requests.Session()
def _text_of(element, default=""):
    """Return ``element.text``, or *default* if the element or its text is missing."""
    # Compare with None explicitly: an Element without children is falsy.
    if element is None:
        return default
    return element.text or default


def _parse_feed(content):
    """Parse an RSS 2.0 or Atom document into a list of article dicts.

    Args:
        content: Raw XML document (bytes or str) as received from the server.

    Returns:
        A list of at most ten dicts with the keys ``title``, ``link``,
        ``pub_date``, ``creator`` and ``tags``; empty if the feed has no
        entries.

    Raises:
        xml.etree.ElementTree.ParseError: If *content* is not well-formed XML.
    """
    from xml.etree import ElementTree

    ns_dc = {"dc": "http://purl.org/dc/elements/1.1/"}
    ns_atom = {"atom": "http://www.w3.org/2005/Atom"}

    root = ElementTree.fromstring(content)

    # RSS 2.0 items are not namespaced; only when none exist try Atom entries.
    # The format is decided once per feed, so an RSS item that merely lacks a
    # <title> is still read with the RSS rules.
    entries = root.findall(".//item")
    is_atom = not entries
    if is_atom:
        entries = root.findall("atom:entry", ns_atom)

    articles = []
    for entry in entries[:10]:
        if is_atom:
            title_el = entry.find("atom:title", ns_atom)

            # Prefer the page link (rel="alternate", which is also the default
            # when rel is absent) over rel="self"/"enclosure"; otherwise take
            # the first link available.
            link_els = entry.findall("atom:link", ns_atom)
            link_el = next(
                (el for el in link_els if el.get("rel", "alternate") == "alternate"),
                link_els[0] if link_els else None,
            )
            link = link_el.get("href", "") if link_el is not None else ""

            # <published> is optional in Atom, <updated> is mandatory.
            date_el = entry.find("atom:published", ns_atom)
            if date_el is None:
                date_el = entry.find("atom:updated", ns_atom)

            creator_el = entry.find("atom:author/atom:name", ns_atom)

            # Atom categories carry their label in the "term" attribute.
            labels = (
                cat.get("term") or cat.text
                for cat in entry.findall("atom:category", ns_atom)
            )
            tags = [label for label in labels if label]
        else:
            title_el = entry.find("title")

            # A guid flagged isPermaLink="true" holds the clean URL; fall back
            # to <link> so that items without such a guid still get a link.
            link_el = entry.find("guid[@isPermaLink='true']")
            if link_el is None:
                link_el = entry.find("link")
            link = _text_of(link_el)

            date_el = entry.find("pubDate")
            creator_el = entry.find("dc:creator", ns_dc)
            tags = [cat.text for cat in entry.findall("category") if cat.text]

        articles.append({
            "title": _text_of(title_el, "Без названия"),
            "link": link,
            "pub_date": _text_of(date_el),
            "creator": _text_of(creator_el),
            "tags": tags,
        })
    return articles


async def fetch_rss(url):
    """Download and parse an RSS feed (RSS 2.0 / Atom).

    The blocking HTTP request runs in a worker thread via
    ``asyncio.to_thread`` so the event loop is not blocked.

    Args:
        url: Address of the feed to download.

    Returns:
        A list of up to ten article dicts (keys ``title``, ``link``,
        ``pub_date``, ``creator``, ``tags``), an empty list when the feed has
        no entries, or ``None`` when the request fails or the response is not
        well-formed XML.
    """
    from xml.etree import ElementTree

    try:
        response = await asyncio.to_thread(_session.get, url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return None

    try:
        return _parse_feed(response.content)
    except ElementTree.ParseError:
        # A broken document is reported the same way as a failed download
        # instead of letting the parser error escape to the caller.
        return None
def _parse_date(pub_date):
    """Convert a feed timestamp into a ``dd.mm.yyyy`` string.

    Two input formats are understood: RFC 822/2822 dates as used by RSS
    ``<pubDate>`` (e.g. ``"Wed, 10 Jan 2024 12:00:00 GMT"``) and ISO 8601
    timestamps as used by Atom (e.g. ``"2024-01-10T12:00:00Z"``). The date is
    taken in the timestamp's own UTC offset; no time-zone conversion is done.

    Args:
        pub_date: Timestamp text from the feed; may be empty or ``None``.

    Returns:
        The formatted date, ``""`` for empty input, or — when the text matches
        neither format — its first ten characters with ``-`` replaced by ``.``.
    """
    from email.utils import parsedate_to_datetime

    if not pub_date:
        return ""

    text = pub_date.strip()

    # RFC 2822 parser from the standard library: it does not depend on the
    # process locale (unlike strptime with %a/%b) and understands "GMT".
    # Depending on the Python version, bad input raises ValueError or TypeError.
    try:
        return parsedate_to_datetime(text).strftime("%d.%m.%Y")
    except (TypeError, ValueError):
        pass

    # Before Python 3.11, fromisoformat() rejects a trailing "Z", so spell the
    # UTC designator out as an explicit offset.
    iso_text = text[:-1] + "+00:00" if text.endswith("Z") else text
    try:
        return datetime.fromisoformat(iso_text).strftime("%d.%m.%Y")
    except ValueError:
        # Unknown format: best effort, show the leading part of the raw value.
        return pub_date[:10].replace("-", ".")
def truncate_title(title, max_len=60):
    """Shorten *title* when it is longer than ``max_len`` characters.

    Titles of up to ``max_len`` characters are returned unchanged. Longer
    ones are cut to their first ``max_len`` characters and ``"..."`` is
    appended, so the result is ``max_len + 3`` characters long.
    """
    if len(title) <= max_len:
        return title
    head = title[:max_len]
    return head + "..."
def format_articles(articles, title, link):
    """Build the list of output strings for a section of articles/posts.

    The first string is the section header: *title* in bold, followed on the
    next line by *link* wrapped in angle brackets. Each of the first five
    entries of *articles* then adds one string with its truncated title on
    the first line and its formatted date plus bracketed link on the second.
    """
    header = f"**{title}**\n<{link}>"

    def render(item):
        # Same lookup order as before: date first, then title, then link.
        when = _parse_date(item["pub_date"])
        heading = truncate_title(item["title"])
        return f"{heading}\n{when} <{item['link']}>"

    return [header, *map(render, articles[:5])]