Using an AI-Written Crawler to Automatically Repost Articles from a Web-Business Resource Site to Your Own Website
Video link: https://www.bilibili.com/video/BV1eqrtY2E3V/?share_source=copy_web&vd_source=7156b0ab225d26e5e516f592cd4d94ce
Below is the Python code; it is the same code used in the article.
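The script relies on the third-party packages requests and beautifulsoup4. Assuming a standard pip setup, install them first with:

pip install requests beautifulsoup4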
import requests
from bs4 import BeautifulSoup
import time
import json
import re

# API configuration (defined at module level so publish_article can reference it)
API_CONFIG = {
    'headers': {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
}
def publish_article(user_url, postcategoryId, article, config):
    try:
        url = f"{user_url}/?xbzyk_plugin_post&private={config['autoPublishPrivate']}"
        print(url)
        content = article['content']
        # Use a regex to find and strip the first img tag
        content = re.sub(r'<img[^>]*>', '', content, count=1)
        # Prepend imgurl to the start of the content
        if article.get('imgurl'):
            img_tag = f'<img src="{article["imgurl"]}" alt="article cover">'
            content = img_tag + content
        article['content'] = content
        # Append a timestamp to the title to avoid duplicate posts
        current_time = time.strftime("%m%d%H%M", time.localtime())
        modified_title = f"{article['title']}_{current_time}"
        # Build the payload
        payload = {
            "post_title": modified_title,  # title with timestamp appended
            # article body; the password is wrapped in the theme's pay-to-view shortcode
            "content": article['content'] + ' </p></p>[hidecontent type=payshow]' + article.get('pwd', '') + '[/hidecontent]',
            "post_category": f"{postcategoryId},",
            "post_topic": "topic1,topic2",
            "topic_name": "topic",
            "post_date": int(time.time()),
            "riplus_down_info": "Your download info",
            "rizhuti_wppay_down": "Some WP pay download info",
            "zibi_posts_zibpay": {
                "pay_type": "1",
                "pay_price": config['normalUserPrice'],
                "vip_1_price": config['vipUserPrice'],
                "vip_2_price": config['vipUserPrice'],
                "pay_original_price": "99",
                "pay_download": [{"link": "", "more": ""} for _ in range(3)],
                "pay_cuont": "",
                "pay_title": "",
                "pay_doc": "",
                "pay_extra_hide": "",
                "pay_details": "",
                "pay_rebate_discount": "",
                "attributes": [{"key": "", "value": ""} for _ in range(2)],
                "demo_link": [{"url": ""}],
                "pay_modo": "",
                "points_price": "",
                "vip_1_points": "",
                "vip_2_points": "",
                "pay_limit": "0"
            },
            "b2_download": "Download info for B2 theme",
            "ceo_shop_virtual_info": "Ceo shop virtual info",
            "cao_downurl_new": "New download URL"
        }
        json_payload = json.dumps(payload)
        # Retry up to three times on failure
        for attempt in range(3):
            try:
                response = requests.post(
                    url,
                    headers=API_CONFIG['headers'],
                    data=json_payload,
                    timeout=10,
                    verify=False
                )
                print(f"Response from {url}:")
                print(response.text)
                if response.status_code == 200:
                    return True
                print(f"Request failed with status code {response.status_code}, retrying...")
            except requests.exceptions.RequestException as e:
                print(f"Request exception: {str(e)}, retrying...")
                if attempt == 2:
                    raise
            time.sleep(2)
        return False
    except Exception as e:
        print(f"Error while publishing article: {str(e)}")
        return False
def get_article_detail(url, headers):
    try:
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        title_element = soup.find('h1', class_='article-title')
        title = title_element.find('a').text if title_element else "Title not found"
        content_element = soup.find('div', class_='article-content')
        if content_element:
            # Walk all image tags
            for img in content_element.find_all('img'):
                # If src is a relative path (starts with /)
                if img.get('src', '').startswith('/'):
                    # Prepend the full site address
                    img['src'] = f"http://www.xbxm.cc{img['src']}"
            content = str(content_element)  # keep the HTML markup
        else:
            content = "Content not found"
        return {
            'title': title,
            'content': content
        }
    except Exception as e:
        print(f"Error while fetching article detail: {str(e)}")
        return None
def crawl_website(user_url, postcategoryId, config):
    base_url = "http://www.xbxm.cc/index.php/page/{}/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Crawl the first three list pages
    for page in range(1, 4):
        try:
            url = base_url.format(page)
            print(f"\nCrawling page {page}: {url}")
            response = requests.get(url, headers=headers)
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')
            items = soup.find_all('h2', class_='item-heading')
            if items:
                for item in items:
                    link = item.find('a')
                    if link and link.get('href'):
                        article_url = link['href']
                        print(f"\nFound article link: {article_url}")
                        article_detail = get_article_detail(article_url, headers)
                        if article_detail:
                            print(f"Article title: {article_detail['title']}")
                            print(f"Content preview: {article_detail['content'][:200]}...")
                            # Publish the article
                            if publish_article(user_url, postcategoryId, article_detail, config):
                                print("Article published successfully!")
                            else:
                                print("Article publish failed!")
                        time.sleep(1)
            else:
                print(f"No item-heading elements found on page {page}")
        except Exception as e:
            print(f"Error while crawling page {page}: {str(e)}")
if __name__ == "__main__":
    # Configuration
    config = {
        'autoPublishPrivate': 'fb69b69a7bb331f3a36fed26501d9383',  # the plugin's publish key
        'normalUserPrice': '3',  # price for normal users, set to 3
        'vipUserPrice': '3'  # price for VIP users, set to 3
    }
    # Crawl and publish (API_CONFIG is defined near the imports above)
    user_url = "http://www.xbxm.cc"  # base URL of the site to publish to
    postcategoryId = "3"  # category ID, set to 3
    crawl_website(user_url, postcategoryId, config)
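
A note on the image handling in get_article_detail: it only rewrites src values that begin with /, so protocol-relative paths (//cdn.example.com/a.png) and page-relative paths (images/a.png) slip through unchanged. A more robust variant could lean on the standard library's urljoin. Here is a minimal sketch; absolutize_images and its signature are illustrative, not part of the original script:

from urllib.parse import urljoin

def absolutize_images(content_element, page_url):
    # urljoin resolves "/a.png", "images/a.png" and "//cdn/a.png"
    # against the article's own URL and leaves absolute URLs untouched
    for img in content_element.find_all('img'):
        src = img.get('src', '')
        if src:
            img['src'] = urljoin(page_url, src)

Inside get_article_detail this would replace the for-img loop, with the function's url argument passed as page_url, so images resolve relative to the page they actually came from rather than a hard-coded host.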
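One more practical note: publish_article posts with verify=False, so every attempt triggers an InsecureRequestWarning from urllib3. If skipping certificate verification is intentional (for example, a self-signed certificate on your own site), the warning can be silenced with urllib3's own helper; a minimal sketch:

import urllib3

# silence the InsecureRequestWarning that verify=False causes on every request
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)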