97 lines
3.5 KiB
Python
97 lines
3.5 KiB
Python
import asyncio
|
||
from datetime import datetime
|
||
|
||
import requests
|
||
|
||
# Habr RSS feed: daily top articles of the "artificial_intelligence" hub.
# NOTE(review): the `fl=ru` query parameter presumably filters by language; confirm.
RSS_URL_ARTICLES = "https://habr.com/ru/rss/hubs/artificial_intelligence/articles/top/daily/?fl=ru"
# Habr RSS feed: daily top news of the same hub (the URL path says "news",
# the constant name says "posts").
RSS_URL_POSTS = "https://habr.com/ru/rss/hubs/artificial_intelligence/news/top/daily/?fl=ru"

# Module-wide HTTP session used by fetch_rss for every feed request.
_session = requests.Session()
|
||
|
||
|
||
async def fetch_rss(url):
    """Download a feed and return its first entries as plain dicts.

    Both RSS 2.0 (``<item>``) and Atom (``<entry>``) documents are understood.
    The blocking ``requests`` call is executed in a worker thread so the
    event loop is not stalled while the feed downloads.

    Args:
        url: Address of the RSS/Atom feed.

    Returns:
        A list of at most 10 dicts with the keys ``title``, ``link``,
        ``pub_date``, ``creator`` (strings) and ``tags`` (list of strings);
        an empty list when the document contains no items; ``None`` when the
        request fails or the response body is not well-formed XML.
    """
    from xml.etree import ElementTree

    max_items = 10
    atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
    dc_ns = {"dc": "http://purl.org/dc/elements/1.1/"}

    def text_of(element, default=""):
        """Return the element's text, or *default* if it is missing or empty."""
        # Compare with None explicitly: childless elements are falsy.
        if element is None:
            return default
        return element.text or default

    try:
        response = await asyncio.to_thread(_session.get, url, timeout=10)
        response.raise_for_status()
        payload = response.content
    except requests.exceptions.RequestException:
        return None

    try:
        root = ElementTree.fromstring(payload)
    except ElementTree.ParseError:
        # A malformed body is reported the same way as a failed download.
        return None

    # Decide the feed flavour once per document instead of guessing per entry,
    # so an RSS item without <title> is still read as RSS.
    entries = root.findall(".//item")
    is_atom = not entries
    if is_atom:
        entries = root.findall("atom:entry", atom_ns)

    articles = []
    for entry in entries[:max_items]:
        if is_atom:
            title_el = entry.find("atom:title", atom_ns)
            creator_el = entry.find("atom:author/atom:name", atom_ns)

            date_el = entry.find("atom:published", atom_ns)
            if date_el is None:
                # <published> is optional in Atom, <updated> is mandatory.
                date_el = entry.find("atom:updated", atom_ns)

            # Prefer the "alternate" link (a missing rel means alternate);
            # otherwise fall back to the first link of the entry.
            link_els = entry.findall("atom:link", atom_ns)
            link_el = next(
                (el for el in link_els
                 if el.get("rel", "alternate") == "alternate"),
                link_els[0] if link_els else None,
            )
            link = link_el.get("href", "") if link_el is not None else ""

            # Atom categories carry their value in the "term" attribute.
            raw_tags = (
                cat.get("term") or cat.text
                for cat in entry.findall("atom:category", atom_ns)
            )
        else:
            title_el = entry.find("title")
            date_el = entry.find("pubDate")
            creator_el = entry.find("dc:creator", dc_ns)

            # A guid flagged as permalink is the clean URL; otherwise use <link>.
            guid_el = entry.find("guid[@isPermaLink='true']")
            link = text_of(guid_el) or text_of(entry.find("link"))

            raw_tags = (cat.text for cat in entry.findall("category"))

        articles.append({
            "title": text_of(title_el, "Без названия"),
            "link": link,
            "pub_date": text_of(date_el),
            "creator": text_of(creator_el),
            "tags": [tag for tag in raw_tags if tag],
        })

    return articles
|
||
|
||
|
||
def _parse_date(pub_date):
|
||
"""Парсить дату из RSS в строку 'дд.мм.гггг' или вернуть часть даты."""
|
||
if not pub_date:
|
||
return ""
|
||
try:
|
||
d = pub_date.replace(" GMT", " +0000")
|
||
dt = datetime.strptime(d, "%a, %d %b %Y %H:%M:%S %z")
|
||
return dt.strftime("%d.%m.%Y")
|
||
except ValueError:
|
||
return pub_date[:10].replace("-", ".")
|
||
|
||
|
||
def truncate_title(title, max_len=60):
    """Cut *title* down to *max_len* characters, marking the cut with ``...``.

    Titles that already fit are returned unchanged; the ellipsis is appended
    on top of the *max_len* characters that are kept.
    """
    fits = len(title) <= max_len
    return title if fits else f"{title[:max_len]}..."
|
||
|
||
|
||
def format_articles(articles, title, link):
    """Build the output lines for a list of articles or posts.

    Args:
        articles: Sequence of dicts with the keys ``title``, ``link`` and
            ``pub_date``; ``None`` or an empty sequence yields the header only.
        title: Section heading, rendered in bold (``**...**``).
        link: URL shown under the heading, wrapped in ``<...>``.

    Returns:
        A list of strings: the header followed by one entry for each of the
        first five articles (shortened title, then date and link).
    """
    lines = [f"**{title}**\n<{link}>"]
    # `articles` may be None when the feed could not be fetched.
    for article in (articles or [])[:5]:
        date_str = _parse_date(article["pub_date"])
        short_title = truncate_title(article["title"])
        lines.append(f"{short_title}\n{date_str} <{article['link']}>")
    return lines
|