1.安装
pip install Scrapy
#一定要以管理员身份运行dos窗口
conda install scrapy
2.创建项目
scrapy startproject novel
3.创建qidianClass4.py文件,爬取小说一级分类、二级分类的名称和链接,分别存入MongoDB和Redis库中对应表中
# -*- coding: utf-8 -*-
# qidianClass4.py
# Spider: crawl the Qidian "all works" page, extract first-level category
# names/links, follow each to its second-level categories, and persist them
# to MongoDB (novel.novelclass) and Redis (list "novelurl").
#
# Fixes vs. original: source was collapsed onto a single line; duplicated
# `from scrapy.http import Request` removed; unused imports (urlopen,
# BeautifulSoup, lxml.etree) removed.
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from bson.objectid import ObjectId
import pymongo
import redis

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel          # database: novel
collection = db.novelclass  # one document per category

r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianClassSpider(scrapy.Spider):
    """Crawl first- and second-level novel categories from qidian.com."""

    name = "qidianClass4"
    allowed_domains = ["qidian.com"]  # domains the spider may visit
    start_urls = [
        "https://www.qidian.com/all",
    ]

    def parse(self, response):
        """Extract first-level categories; schedule sub-category requests."""
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select(
            '//div[@class="work-filter type-filter"]'
            '/ul[@type="category"]/li[@class=""]/a')
        for secItem in hxsObj:
            className = secItem.select('text()').extract()
            classUrl = secItem.select('@href').extract()
            classUrl = 'https:' + classUrl[0]  # hrefs are protocol-relative
            print(className[0])
            print(classUrl)
            classid = self.insertMongo(className[0], None)
            # Bind the parent id as a lambda default argument so each callback
            # keeps its own pid instead of the loop's last value.
            request = Request(
                classUrl,
                callback=lambda response, pid=str(classid):
                    self.parse_subClass(response, pid))
            yield request
        print("======================")

    def parse_subClass(self, response, pid):
        """Extract second-level categories under parent category `pid`."""
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select(
            '//div[@class="sub-type"]/dl[@class=""]/dd[@class=""]/a')
        for secItem in hxsObj:
            className2 = secItem.select('text()').extract()
            classUrl2 = secItem.select('@href').extract()
            print(className2)
            print('----------------------------')
            classUrl2 = 'https:' + classUrl2[0]
            print(classUrl2)
            classid = self.insertMongo(className2[0], ObjectId(pid))
            self.pushRedis(classid, pid, classUrl2)

    def insertMongo(self, classname, pid):
        """Insert a category document; return its generated _id.

        NOTE(review): Collection.insert is deprecated in pymongo 3.x
        (insert_one); kept for compatibility with the pymongo version the
        original code targets.
        """
        classid = collection.insert({'classname': classname, 'pid': pid})
        return classid

    def pushRedis(self, classid, pid, url):
        """Push "classid,pid,url" onto the Redis list consumed downstream."""
        novelurl = '%s,%s,%s' % (classid, pid, url)
        r.lpush('novelurl', novelurl)
4.创建qidianNovel.py文件,爬取小说名称和链接,分别存入MongoDB和Redis库中对应表中
# -*- coding: utf-8 -*-
# qidianNovel.py
# Spider: for every category URL queued in the Redis list "novelurl", crawl
# the paginated book listing, store each novel's name/link in MongoDB
# (novel.novelname) and push "classid,pid,url," onto Redis list "novelnameurl".
#
# Fixes vs. original: source was collapsed onto a single line; the next-page
# request was issued even when nextUrl() found no next page (None URL crash);
# unused imports removed.
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
import pymongo
import redis

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel        # database: novel
collection = db.novelname  # one document per novel

r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianNovelSpider(scrapy.Spider):
    """Crawl novel names/links from the category listing pages in Redis."""

    name = "qidianNovel"
    allowed_domains = ["qidian.com"]

    def __init__(self):
        # Seed start_urls from the "novelurl" Redis list produced by the
        # category spider; remember classid/pid/pages-followed per URL.
        start_urls = []
        urlList = r.lrange('novelurl', 0, -1)
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        """Store every novel on the page, then follow the next page (<= 4)."""
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        pid = classInfo['pid']
        num = classInfo['num']
        if num > 3:  # follow at most 4 listing pages per category
            return None
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="book-mid-info"]/h4/a')
        for secItem in hxsObj:
            className = secItem.select('text()').extract()
            classUrl = secItem.select('@href').extract()
            classUrl = 'https:' + classUrl[0]  # hrefs are protocol-relative
            print(className[0])
            print(classUrl)
            classid = self.insertMongo(className[0], objectid)
            self.pushRedis(classid, objectid, classUrl)
        nextPage = self.nextUrl(response)
        # BUG FIX: the original built Request(nextPage) unconditionally and
        # crashed with a None URL on the last page; only follow when present.
        if nextPage:
            classInfo['num'] += 1
            self.dict[nextPage] = classInfo
            yield Request(nextPage, callback=self.parse)
        print('--------end--------------')

    def nextUrl(self, response):
        """Return the absolute URL of the next listing page, or None.

        NOTE(review): the trailing space in class "lbf-pagination-next " is in
        the site's markup — do not "fix" it.
        """
        hxs = HtmlXPathSelector(response)
        nextPage = hxs.select('//a[@class="lbf-pagination-next "]')
        if len(nextPage) == 1:
            nextPage = nextPage.select('@href').extract()
            nextPage = "https:" + nextPage[0]
            print('==============' + nextPage + '====================')
            return nextPage
        return None

    def insertMongo(self, className, pid):
        """Insert a novel document; return its generated _id."""
        classid = collection.insert({'classname': className, 'pid': pid})
        return classid

    def pushRedis(self, classid, pid, classUrl):
        """Push "classid,pid,url," onto the Redis list for the next stage."""
        novelnameurl = '%s,%s,%s,' % (classid, pid, classUrl)
        r.lpush('novelnameurl', novelnameurl)
5.创建qidianNovelChapterInfo.py文件,爬取小说名称下的章节和链接,分别存入MongoDB和Redis库中的对应表中
# -*- coding: utf-8 -*-
# qidianNovelChapterInfo.py
# Spider: for each novel URL queued in the Redis list "novelnameurl", crawl
# the novel's chapter list, store each chapter's name/parent id in MongoDB
# (novel.novelChapterInfo) and push "classid,pid,chapterUrl" onto the Redis
# list "novelChapterUrl".
#
# Fixes vs. original: source was collapsed onto a single line; unused imports
# (HtmlXPathSelector, Request, sleep) removed.
import scrapy
from lxml import etree
import pymongo
import redis

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel              # database: novel
collection = db.novelChapterInfo  # one document per chapter

r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianNovelSpider(scrapy.Spider):
    """Crawl chapter names/links for every novel queued in Redis."""

    name = "qidianNovelChapterInfo"
    allowed_domains = ["qidian.com"]

    def __init__(self):
        # Seed start_urls from the "novelnameurl" Redis list produced by the
        # novel-listing spider.
        start_urls = []
        urlList = r.lrange('novelnameurl', 0, -1)
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        """Store every chapter title/link found on the novel page."""
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        pid = classInfo['pid']
        html = response.body.decode('utf-8')
        selector = etree.HTML(html)
        # Chapter links live in <ul class="cf"><li><a href=...>.
        novelChapters = selector.xpath('//ul[@class="cf"]/li/a')
        for item in novelChapters:
            novelChapter = item.text
            print(item.text)
            novelChapterUrl = 'https:' + item.get('href')  # protocol-relative
            print(novelChapterUrl)
            classid = self.insertMongo(novelChapter, objectid)
            self.pushRedis(classid, objectid, novelChapterUrl)

    def insertMongo(self, novelChapter, pid):
        """Insert a chapter document; return its generated _id."""
        classid = collection.insert({'novelChapter': novelChapter, 'pid': pid})
        return classid

    def pushRedis(self, classid, pid, novelChapterUrl):
        """Push "classid,pid,chapterUrl" onto the Redis list for stage 7."""
        novelChapterUrl = '%s,%s,%s' % (classid, pid, novelChapterUrl)
        r.lpush('novelChapterUrl', novelChapterUrl)
6.创建qidianNovelWorksInfo.py文件,爬取小说基本信息,更新到原有的存小说名称的MongoDB(novel)库小说名称表中
# -*- coding: utf-8 -*-
# qidianNovelWorksInfo.py
# Spider: for each novel URL queued in the Redis list "novelnameurl", crawl
# the novel's detail page and update the existing novel.novelname document
# with author, title, publication state and the two category levels.
#
# Fixes vs. original: source was collapsed onto a single line; the $set
# stored the raw xpath() result *lists* instead of scalar values; unused
# imports and dead commented-out code removed.
import scrapy
from lxml import etree
import pymongo
from bson.objectid import ObjectId
import redis

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel        # database: novel
collection = db.novelname  # documents created by qidianNovel.py

r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianNovelSpider(scrapy.Spider):
    """Enrich stored novels with detail-page metadata."""

    name = "qidianNovelWorksInfo"
    allowed_domains = ["qidian.com"]

    def __init__(self):
        # Seed start_urls from the "novelnameurl" Redis list.
        start_urls = []
        urlList = r.lrange('novelnameurl', 0, -1)
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        """Scrape author/title/state/categories; update the novel document."""
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        objectid2 = ObjectId(objectid)  # _id of the novelname doc to update
        pid = classInfo['pid']
        html = response.body.decode('utf-8')
        selector = etree.HTML(html)
        # NOTE: the trailing space in class "book-info " matches the site's
        # actual markup.
        workName = selector.xpath(
            '//div[@class="book-info "]/h1/span/a[@class="writer"]/text()')
        novelName = selector.xpath('//div[@class="book-info "]/h1/em/text()')
        novelState = selector.xpath(
            '//div[@class="book-info "]/p[@class="tag"]/span[@class="blue"]/text()')
        novelClass = selector.xpath(
            '//div[@class="book-info "]/p[@class="tag"]/a[@class="red"]/text()')
        objClass = novelClass[0]   # first-level category
        sonClass = novelClass[1]   # second-level category
        print("小说名:" + novelName[0])
        print("作者名:" + workName[0])
        print("状态:" + novelState[0])
        print("小说分类:" + objClass)
        print("小说分类2:" + sonClass)
        # BUG FIX: the original stored the whole xpath() lists; store the
        # scalar first match, consistent with what is printed above.
        db.novelname.update(
            {"_id": objectid2},
            {"$set": {'workName': workName[0],
                      'novelName': novelName[0],
                      'novelState': novelState[0],
                      'objClass': objClass,
                      'sonClass': sonClass}})
        print('--------end--------------')
7.创建qidianNovelChapterContent.py文件,爬取小说章节内容,更新到原有的存小说章节的MongoDB(novel)库下章节表
# -*- coding: utf-8 -*-
# qidianNovelChapterContent.py
# Spider: for each chapter URL queued in the Redis list "novelChapterUrl",
# crawl the chapter body and write its text into the existing
# novel.novelChapterInfo document.
#
# Fixes vs. original: source was collapsed onto a single line; paragraph
# accumulation `ii = text + ii` stored the chapter in REVERSE order and
# raised TypeError on empty <p> nodes (item.text is None); unused selenium
# imports removed.
import scrapy
from lxml import etree
import pymongo
from bson.objectid import ObjectId
import redis

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel              # database: novel
collection = db.novelChapterInfo  # documents created by qidianNovelChapterInfo.py

r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class qidianNovelSpider(scrapy.Spider):
    """Fetch chapter bodies and store them on the chapter documents."""

    name = "qidianNovelChapterContent"
    allowed_domains = ["qidian.com"]

    def __init__(self):
        # Seed start_urls from the "novelChapterUrl" Redis list.
        start_urls = []
        urlList = r.lrange('novelChapterUrl', 0, -1)
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
        self.start_urls = start_urls

    def parse(self, response):
        """Concatenate the chapter's paragraphs and update its document."""
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        objectid2 = ObjectId(objectid)  # _id of the chapter doc to update
        html = response.body.decode('utf-8')
        selector = etree.HTML(html)
        paragraphs = selector.xpath(
            '//div[@class ="read-content j_readContent"]/p')
        # BUG FIX: join paragraphs in document order (the original prepended
        # each one, reversing the text) and skip empty <p> nodes whose .text
        # is None.
        ii = ''.join(item.text for item in paragraphs if item.text)
        db.novelChapterInfo.update(
            {"_id": objectid2},
            {"$set": {'novelChaptersContent': ii}})
        print('------------------------------------------------------')
8.运行,在项目根目录下dos窗口中执行:
scrapy crawl 爬虫名(爬虫名对应py文件中的name=" "属性,例如:scrapy crawl qidianClass4)
最近一直忙于手中的项目,一直没有整理,抱歉。