From ac4fd844a478b6055af1245902e6f5eca6e4ba7e Mon Sep 17 00:00:00 2001
From: radiyyadwi
Date: Mon, 7 Aug 2017 23:20:20 +0700
Subject: [PATCH 1/2] Tokopedia scraper with Scrapy (work in progress)

---
 tokopedia/scrapy.cfg | 11 ++
 tokopedia/tokopedia/__init__.py | 0
 tokopedia/tokopedia/__init__.pyc | Bin 0 -> 167 bytes
 tokopedia/tokopedia/items.json | 0
 tokopedia/tokopedia/items.py | 16 +++
 tokopedia/tokopedia/items.pyc | Bin 0 -> 532 bytes
 tokopedia/tokopedia/middlewares.py | 56 ++++++++++
 tokopedia/tokopedia/pipelines.py | 36 +++++++
 tokopedia/tokopedia/pipelines.pyc | Bin 0 -> 1511 bytes
 tokopedia/tokopedia/settings.py | 96 ++++++++++++++++++
 tokopedia/tokopedia/settings.pyc | Bin 0 -> 523 bytes
 tokopedia/tokopedia/spiders/__init__.py | 4 +
 tokopedia/tokopedia/spiders/__init__.pyc | Bin 0 -> 175 bytes
 .../tokopedia/spiders/tokopedia_spider.py | 25 +++++
 .../tokopedia/spiders/tokopedia_spider.pyc | Bin 0 -> 1417 bytes
 15 files changed, 244 insertions(+)
 create mode 100644 tokopedia/scrapy.cfg
 create mode 100644 tokopedia/tokopedia/__init__.py
 create mode 100644 tokopedia/tokopedia/__init__.pyc
 create mode 100644 tokopedia/tokopedia/items.json
 create mode 100644 tokopedia/tokopedia/items.py
 create mode 100644 tokopedia/tokopedia/items.pyc
 create mode 100644 tokopedia/tokopedia/middlewares.py
 create mode 100644 tokopedia/tokopedia/pipelines.py
 create mode 100644 tokopedia/tokopedia/pipelines.pyc
 create mode 100644 tokopedia/tokopedia/settings.py
 create mode 100644 tokopedia/tokopedia/settings.pyc
 create mode 100644 tokopedia/tokopedia/spiders/__init__.py
 create mode 100644 tokopedia/tokopedia/spiders/__init__.pyc
 create mode 100644 tokopedia/tokopedia/spiders/tokopedia_spider.py
 create mode 100644 tokopedia/tokopedia/spiders/tokopedia_spider.pyc

diff --git a/tokopedia/scrapy.cfg b/tokopedia/scrapy.cfg
new file mode 100644
index 00000000..44d8f940
--- /dev/null
+++ b/tokopedia/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = tokopedia.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = tokopedia
diff --git a/tokopedia/tokopedia/__init__.py b/tokopedia/tokopedia/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tokopedia/tokopedia/__init__.pyc b/tokopedia/tokopedia/__init__.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..771cd14e86b34790d21745b22ce1562c2ead91ee
diff --git a/tokopedia/tokopedia/items.json b/tokopedia/tokopedia/items.json
new file mode 100644
index 00000000..e69de29b
diff --git a/tokopedia/tokopedia/items.py b/tokopedia/tokopedia/items.py
new file mode 100644
index 00000000..89b7ffc4
--- /dev/null
+++ b/tokopedia/tokopedia/items.py
@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+from scrapy.item import Item, Field
+
+
+class TokopediaItem(Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    name = Field()
+    price = Field()
+    url = Field()
diff --git a/tokopedia/tokopedia/items.pyc b/tokopedia/tokopedia/items.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d13ea8337fb9505ea692667e8c3495ce5192961d
diff --git a/tokopedia/tokopedia/settings.py b/tokopedia/tokopedia/settings.py
new file mode 100644
index 00000000..c6d49df3
--- /dev/null
+++ b/tokopedia/tokopedia/settings.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for tokopedia project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+# http://doc.scrapy.org/en/latest/topics/settings.html
+# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'tokopedia'
+
+SPIDER_MODULES = ['tokopedia.spiders']
+NEWSPIDER_MODULE = 'tokopedia.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'tokopedia (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#    'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'tokopedia.middlewares.TokopediaSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'tokopedia.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'tokopedia.pipelines.TokopediaPipeline': 300,
+#}
+ITEM_PIPELINES = {'tokopedia.pipelines.TokopediaPipeline': 300}
+
+MONGODB_SERVER = "localhost"
+MONGODB_PORT = 27017
+MONGODB_DB = "tokopedia"
+MONGODB_COLLECTION = "item"
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/tokopedia/tokopedia/settings.pyc b/tokopedia/tokopedia/settings.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc624bbc5445ec6d4861f8a150f011abd925a590
diff --git a/tokopedia/tokopedia/spiders/tokopedia_spider.py b/tokopedia/tokopedia/spiders/tokopedia_spider.py
new file mode 100644
index 00000000..08e3d712
--- /dev/null
+++ b/tokopedia/tokopedia/spiders/tokopedia_spider.py
@@ -0,0 +1,25 @@
+from scrapy import Spider
+from scrapy.selector import Selector
+
+from tokopedia.items import TokopediaItem
+
+
+class TokopediaSpider(Spider):
+    name = "tokopedia"
+    allowed_domains = ["tokopedia.com"]
+    start_urls = [
+        "https://www.tokopedia.com/p/handphone-tablet/handphone?page=1",
+    ]
+
+    def parse(self, response):
+        cards = Selector(response).xpath('//div[@class="product-summary"]')
+
+        for card in cards:
+            item = TokopediaItem()
+            item['name'] = card.xpath(
+                'div[@class="product-name ng-binding"]/text()').extract()[0]
+            item['price'] = card.xpath(
+                'div[@class="product-price ng-binding"]/text()').extract()[0]
+            item['url'] = card.xpath(
+                'a[@class="ng-href"]/@href').extract()[0]
+            yield item
diff --git a/tokopedia/tokopedia/spiders/tokopedia_spider.pyc b/tokopedia/tokopedia/spiders/tokopedia_spider.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd657fd2093febb1a2d4b4ebfae2f0f9919ce757
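
The settings in this patch register tokopedia.pipelines.TokopediaPipeline and define MONGODB_SERVER, MONGODB_PORT, MONGODB_DB, and MONGODB_COLLECTION. A minimal pymongo-backed pipeline consistent with those settings might look like the sketch below; the class body is an assumption for illustration, not necessarily the project's actual pipelines.py.

import pymongo


class TokopediaPipeline(object):
    """Sketch of a pipeline that stores each scraped item in MongoDB."""

    def __init__(self, server, port, db, collection):
        self.server = server
        self.port = port
        self.db_name = db
        self.collection_name = collection

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection details from the MONGODB_* settings above.
        s = crawler.settings
        return cls(s.get('MONGODB_SERVER'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DB'), s.get('MONGODB_COLLECTION'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.server, self.port)
        self.collection = self.client[self.db_name][self.collection_name]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert_one expects a plain dict, not a Scrapy Item.
        self.collection.insert_one(dict(item))
        return item

With a pipeline like this enabled, the spider can be run with the standard command scrapy crawl tokopedia (optionally with -o items.json to also export the scraped items to the items.json file added in this patch).
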
From c6a8c2e300553ae9bc01e25378ed7b6fb55146cf Mon Sep 17 00:00:00 2001
From: rizkiduwinanto
Date: Mon, 7 Aug 2017 23:26:16 +0700
Subject: [PATCH 2/2] Add scraper (Scrapper.py)

---
 Scrapper.py | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 Scrapper.py

diff --git a/Scrapper.py b/Scrapper.py
new file mode 100644
index 00000000..b7c25b59
--- /dev/null
+++ b/Scrapper.py
@@ -0,0 +1,40 @@
+from bs4 import BeautifulSoup
+from urllib import urlopen
+import pymysql
+
+# Bukalapak handphone category URL
+bukalapak = "https://www.bukalapak.com/c/handphone?source=navbar&from=navbar_categories"
+
+# fetch the Bukalapak listing page
+bukalapakClient = urlopen(bukalapak)
+html_page = bukalapakClient.read()
+bukalapakClient.close()
+
+# open the database connection
+db = pymysql.connect(db='base', user='root', passwd='pwd', unix_socket="/tmp/mysql.sock")
+cursor = db.cursor()
+cursor.execute("DROP TABLE IF EXISTS database1")
+
+sql = """CREATE TABLE database1 (Nama Varchar(100), Harga Int )"""
+cursor.execute(sql)
+
+
+# scrape name and price from Bukalapak
+bukalapakPage = BeautifulSoup(html_page, "html5lib")
+products = bukalapakPage.find_all("li", class_ = "product--sem col-12--2")
+for product in products:
+    product_name = product.div.article.div.a["title"]
+    product_price = product.find("div", class_ = "product-price").find("span", class_ = "amount positive").text
+    # insert into the table
+    try:
+        cursor.execute("""INSERT INTO database1 VALUES (%s,%s)""",(product_name,product_price))
+        db.commit()
+    except:
+        db.rollback()
+
+# show the table contents
+cursor.execute("""SELECT * FROM database1;""")
+
+print cursor.fetchall()
+
+db.close()
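
Scrapper.py inserts product_price straight into an Int column, but the scraped price is a formatted string (something like 'Rp1.550.000'; the exact format is an assumption here, not taken from the patch). A small helper to normalise the text before the INSERT might look like this sketch:

def parse_price(text):
    # Keep only the digits of an assumed 'Rp1.550.000'-style price string.
    digits = ''.join(ch for ch in text if ch.isdigit())
    return int(digits) if digits else None

# usage inside the product loop, in place of the existing INSERT:
# cursor.execute("""INSERT INTO database1 VALUES (%s,%s)""",
#                (product_name, parse_price(product_price)))

Without some normalisation like this, MySQL may reject or zero out the value, and the bare except around the INSERT will silently roll the row back.
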