爬取csdn的数据与解析存储(9)

摘要:
摘要: 本文演示如何用 requests 抓取 CSDN 论坛左侧菜单的 JS 数据, 用正则表达式与 ast.literal_eval 解析出论坛节点列表 (forumNodes), 再递归提取出最终需要爬取的 URL 列表, 并用 peewee 将解析结果存入 MySQL。

安装软件:

pip install pymysql

pip install peewee

创建数据模型orm

from peewee import *

# Single shared MySQL connection; credentials are hard-coded for the tutorial.
db = MySQLDatabase("spider", host="127.0.0.1", port=3306, user="root", password="root")

class BaseModel(Model):
    # Base class so every model binds to the same database without repeating Meta.
    class Meta:
        database = db

#设计数据表的时候有几个重要点一定要注意
"""
char类型, 要设置最大长度
对于无法确定最大长度的字段,可以设置为Text
设计表的时候 采集到的数据要尽量先做格式化处理
default和null=True
"""


class Topic(BaseModel):
    """A forum topic (thread). The primary key is the numeric id taken from the topic URL."""
    title = CharField()
    content = TextField(default="")        # topic body HTML, filled in later by parse_topic
    id = IntegerField(primary_key=True)    # not auto-increment: save() needs force_insert for new rows
    author = CharField()                   # author id string parsed from the author URL
    create_time = DateTimeField()
    answer_nums = IntegerField(default=0)  # reply count
    click_nums = IntegerField(default=0)   # view count
    praised_nums = IntegerField(default=0) # like count
    jtl = FloatField(default=0.0)  # thread-closing rate percentage ("结帖率")
    score = IntegerField(default=0)  # reward points offered ("赏分")
    status = CharField()  # topic status label from the list page
    last_answer_time = DateTimeField()


class Answer(BaseModel):
    """A single reply to a topic."""
    topic_id = IntegerField()   # id of the parent Topic (plain field, not a ForeignKeyField)
    author = CharField()        # replier's author id
    content = TextField(default="")  # reply body HTML
    create_time = DateTimeField()
    parised_nums = IntegerField(default=0)  # like count; NOTE: misspelling of "praised" kept — renaming would change the DB schema and break parse_topic


class Author(BaseModel):
    """A CSDN user profile; populated by parse_author from the profile page stats."""
    name = CharField()
    id = CharField(primary_key=True)  # string id from the profile URL; non-auto pk, needs force_insert on first save
    click_nums = IntegerField(default=0)  # profile visit count
    original_nums = IntegerField(default=0)  # original-post count
    forward_nums = IntegerField(default=0)  # forwarded-post count
    rate = IntegerField(default=-1)  # site-wide rank; -1 means unknown
    answer_nums = IntegerField(default=0)  # comment count
    parised_nums = IntegerField(default=0)  # likes received; same misspelling as Answer.parised_nums, kept for schema compatibility
    desc = TextField(null=True)       # self-description, may be absent
    industry = CharField(null=True)   # industry, may be absent
    location = CharField(null=True)   # location, may be absent
    follower_nums = IntegerField(default=0)  # followers
    following_nums = IntegerField(default=0)  # following

if __name__ == "__main__":
    # Create all three tables when this module is run directly (one-time setup).
    db.create_tables([Topic, Answer, Author])

数据爬取与解析:

"""
抓取
解析
存储
"""
import re
import ast
from urllib import parse
from datetime import datetime

import requests
from scrapy import Selector

from csdn_spider.models import *

domain = "https://bbs.csdn.net"
def get_nodes_json():
    """Fetch CSDN's left-menu JS file and parse out the forumNodes array.

    Returns:
        The node tree as a list of dicts, or [] when the pattern is not found.
    """
    # Fix: without a timeout, requests.get can block forever on a stalled
    # connection and hang the whole crawler.
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn", timeout=10).text
    # Raw string for the regex; captures everything up to the closing "]".
    nodes_str_match = re.search(r"forumNodes: (.*])", left_menu_text)
    if nodes_str_match:
        # JS "null" is not a Python literal; map it to None so literal_eval accepts it.
        nodes_str = nodes_str_match.group(1).replace("null", "None")
        nodes_list = ast.literal_eval(nodes_str)
        return nodes_list
    return []

url_list = []  # module-level accumulator filled by process_nodes_list
def process_nodes_list(nodes_list):
    """Recursively walk the node tree and collect every non-empty "url" into url_list.

    Nodes without a "url" key are skipped entirely (their children, if any,
    are not descended into — mirroring the menu data's structure).
    """
    for node in nodes_list:
        if "url" not in node:
            continue
        if node["url"]:
            url_list.append(node["url"])
        if "children" in node:
            process_nodes_list(node["children"])

def get_level1_list(nodes_list):
    """Return the URLs of the top-level (level-1) section nodes only.

    A node contributes its "url" when the key exists and the value is non-empty;
    children are deliberately not descended into.
    """
    return [node["url"] for node in nodes_list if node.get("url")]

def get_last_urls():
    """Build the final list of absolute list-page URLs to crawl.

    Leaf URLs are those collected into url_list that are not level-1 section
    URLs; each leaf expands into three absolute URLs — the default tab plus
    the "/recommend" and "/closed" tabs.
    """
    nodes_list = get_nodes_json()
    process_nodes_list(nodes_list)
    level1_urls = set(get_level1_list(nodes_list))
    leaf_urls = [u for u in url_list if u not in level1_urls]
    all_urls = []
    for leaf in leaf_urls:
        for suffix in ("", "/recommend", "/closed"):
            all_urls.append(parse.urljoin(domain, leaf + suffix))
    return all_urls


def parse_topic(url):
    """Crawl a topic detail page and all its reply pages.

    Updates the already-saved Topic row (body, like count, closing rate)
    and inserts one Answer row per reply.  Follows the "next page" link
    with a loop instead of recursion so very long threads cannot exhaust
    the recursion limit.
    """
    topic_id = url.split("/")[-1]
    next_url = url
    while next_url:
        res_text = requests.get(next_url).text
        sel = Selector(text=res_text)
        all_divs = sel.xpath("//div[starts-with(@id, 'post-')]")
        topic_item = all_divs[0]  # the first post block on the page is the topic itself
        content = topic_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]
        praised_nums = topic_item.xpath(".//label[@class='red_praise digg']//em/text()").extract()[0]
        jtl_str = topic_item.xpath(".//div[@class='close_topic']/text()").extract()[0]
        jtl = 0
        # Bug fix: the original pattern "(d+)%" matched a literal 'd', so the
        # closing rate ("结帖率") was never extracted; \d+ is required.
        jtl_match = re.search(r"(\d+)%", jtl_str)
        if jtl_match:
            jtl = int(jtl_match.group(1))
        existed_topics = Topic.select().where(Topic.id == topic_id)
        if existed_topics:
            topic = existed_topics[0]
            topic.content = content
            topic.jtl = jtl
            topic.praised_nums = praised_nums
            topic.save()

        for answer_item in all_divs[1:]:
            answer = Answer()
            answer.topic_id = topic_id
            author_info = answer_item.xpath(".//div[@class='nick_name']//a[1]/@href").extract()[0]
            author_id = author_info.split("/")[-1]
            create_time = answer_item.xpath(".//label[@class='date_time']/text()").extract()[0]
            create_time = datetime.strptime(create_time, "%Y-%m-%d %H:%M:%S")
            answer.author = author_id
            answer.create_time = create_time
            # Bug fix: these two values were read from topic_item, so every answer
            # stored the topic's like count and body instead of its own.
            praised_nums = answer_item.xpath(".//label[@class='red_praise digg']//em/text()").extract()[0]
            answer.parised_nums = int(praised_nums)
            content = answer_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]
            answer.content = content

            answer.save()

        next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
        next_url = parse.urljoin(domain, next_page[0]) if next_page else None


def parse_author(url):
    """Crawl a user profile page and upsert an Author row.

    The six counters in the stats bar are read positionally; the assumed
    order is (visits, originals, forwards, rank, comments, likes) — based
    on the page layout, confirm if CSDN changes the profile template.
    """
    author_id = url.split("/")[-1]
    # A browser User-Agent is sent here (the other fetches omit it) —
    # presumably the profile page serves different content to the default
    # requests UA; confirm before removing.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
    }
    res_text = requests.get(url, headers=headers).text
    sel = Selector(text=res_text)
    author = Author()
    author.id = author_id
    all_li_strs = sel.xpath("//ul[@class='mod_my_t clearfix']/li/span/text()").extract()
    # Consistency fix: the first two counters were stored as raw strings while
    # the remaining four were int()-converted; all six feed IntegerFields.
    click_nums = int(all_li_strs[0])
    original_nums = int(all_li_strs[1])
    forward_nums = int(all_li_strs[2])
    rate = int(all_li_strs[3])
    answer_nums = int(all_li_strs[4])
    parised_nums = int(all_li_strs[5])

    author.click_nums = click_nums
    author.original_nums = original_nums
    author.forward_nums = forward_nums
    author.rate = rate
    author.answer_nums = answer_nums
    author.parised_nums = parised_nums

    desc = sel.xpath("//dd[@class='user_desc']/text()").extract()
    if desc:
        author.desc = desc[0].strip()
    # The "person_b" list mixes location and industry items; the icon class
    # "csdnc-m-add" distinguishes the location entry.
    person_b = sel.xpath("//dd[@class='person_b']/ul/li")
    for item in person_b:
        item_text = "".join(item.extract())
        if "csdnc-m-add" in item_text:
            location = item.xpath(".//span/text()").extract()[0].strip()
            author.location = location
        else:
            industry = item.xpath(".//span/text()").extract()[0].strip()
            author.industry = industry
    name = sel.xpath("//h4[@class='username']/text()").extract()[0]
    author.name = name.strip()
    # Author.id is a non-auto primary key: peewee's save() only UPDATEs in
    # that case, so a brand-new row needs force_insert=True.
    existed_author = Author.select().where(Author.id == author_id)
    if existed_author:
        author.save()
    else:
        author.save(force_insert=True)


def parse_list(url):
    """Crawl a forum list page: upsert each topic row, crawl its detail
    page, then advance to the next list page.

    Pagination is handled with a loop rather than self-recursion, so a
    section with thousands of pages cannot overflow the call stack.
    """
    next_url = url
    while next_url:
        res_text = requests.get(next_url).text
        sel = Selector(text=res_text)
        # Skip the two header rows of the forum table.
        all_trs = sel.xpath("//table[@class='forums_tab_table']//tr")[2:]
        for tr in all_trs:
            topic = Topic()

            # Status badge and reward score are optional columns.
            if tr.xpath(".//td[1]/span/text()").extract():
                status = tr.xpath(".//td[1]/span/text()").extract()[0]
                topic.status = status
            if tr.xpath(".//td[2]/em/text()").extract():
                score = tr.xpath(".//td[2]/em/text()").extract()[0]
                topic.score = int(score)
            topic_url = parse.urljoin(domain, tr.xpath(".//td[3]/a/@href").extract()[0])
            topic_title = tr.xpath(".//td[3]/a/text()").extract()[0]
            author_url = parse.urljoin(domain, tr.xpath(".//td[4]/a/@href").extract()[0])
            author_id = author_url.split("/")[-1]
            create_time = tr.xpath(".//td[4]/em/text()").extract()[0]
            create_time = datetime.strptime(create_time, "%Y-%m-%d %H:%M")
            # Column 5 packs "answers/clicks" into a single "a/b" string.
            answer_info = tr.xpath(".//td[5]/span/text()").extract()[0]
            answer_nums = answer_info.split("/")[0]
            click_nums = answer_info.split("/")[1]
            last_time_str = tr.xpath(".//td[6]/em/text()").extract()[0]
            last_time = datetime.strptime(last_time_str, "%Y-%m-%d %H:%M")

            topic.id = int(topic_url.split("/")[-1])
            topic.title = topic_title
            topic.author = author_id
            topic.click_nums = int(click_nums)
            topic.answer_nums = int(answer_nums)
            topic.create_time = create_time
            topic.last_answer_time = last_time
            # Topic.id is a non-auto primary key, so new rows need force_insert.
            existed_topics = Topic.select().where(Topic.id == topic.id)
            if existed_topics:
                topic.save()
            else:
                topic.save(force_insert=True)

            parse_topic(topic_url)
            # parse_author(author_url)

        next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
        next_url = parse.urljoin(domain, next_page[0]) if next_page else None


if __name__ == "__main__":
    # Entry point: resolve every list-page URL, then crawl each one in turn.
    last_urls = get_last_urls()
    for url in last_urls:
        parse_list(url)
    # NOTE(review): printed only after the whole crawl finishes; move above
    # the loop if the URL list should be visible up front.
    print(last_urls)

免责声明:文章转载自《爬取csdn的数据与解析存储(9)》仅用于学习参考。如对内容有疑问,请及时联系本站处理。

上篇Layui upload 上传有进度条Openstack_通用技术_RPC 远程异步调用下篇

宿迁高防,2C2G15M,22元/月;香港BGP,2C5G5M,25元/月 雨云优惠码:MjYwNzM=

相关文章

python爬取免费西祠代理

#!/usr/local/bin/python3.7 """ @File : xicidaili.py @Time : 2020/06/02 @Author : Mozili """ import urllib.request import urllib.parse from lxml import etree import...

Django 语法笔记

Django 语法创建项目框架 django-admin startproject 项目名 创建子app 业务分化,可以优化团队合作,可以明确找锅 python manage.py startapp 子app名 启动项目 python manage.py runserver 项目目录结构 __init__.py: 声明当前文件夹为一个可导入的包 se...

DGL学习(二): 使用DGL构造图

有许多方法可以构造DGLGraph。文档中建议使用的方法有四种,分别如下: ① 使用两个数组,分别存储源节点和目标节点对象 (数组类型可以是numpy 也可以是 tensor)。 ② scipy 中的稀疏矩阵(),表示要构造的图的邻接矩阵。 ③ networkx 的图对象(DGLGraph 和 networkx 可以互转)。 ④ 整数对形式的边列表。 下面...

python中csv文件的读取问题

在python读取csv格式的文件时,使用csv.reader读取文件对象,出现了line contains NULL byte的错误,如下: reader = csv.reader(open(filepath, "rU")) try: for row in reader: print 'Row read successfully!...

ant design pro项目配置路由菜单

有两种菜单格式,一种是就只有一层,一种是有多层的 一、单层的菜单  首先在pages目录下新建一个文件目录  然后在config.js中进行配置 { name: 'new_test', icon: 'table', path: '/new_test', component: './TestRoute/test_rout...

pyqt5 + pyinstaller 制作爬虫小程序

环境:mac python3.7 pyqt5 pyinstaller ps: 主要是熟悉pyqt5, 加入了单选框 输入框 文本框 文件夹选择框及日历下拉框 效果图: pyqt5 主程序文件  # -*- coding: utf-8 -*- # @Author: Mehaei # @Date: 2019-07-10 13:02:56 # @Last...