From ac4fd844a478b6055af1245902e6f5eca6e4ba7e Mon Sep 17 00:00:00 2001
From: radiyyadwi
Date: Mon, 7 Aug 2017 23:20:20 +0700
Subject: [PATCH 1/2] Tokopedia scraper with Scrapy (work in progress)

---
 tokopedia/scrapy.cfg | 11 ++
 tokopedia/tokopedia/__init__.py | 0
 tokopedia/tokopedia/__init__.pyc | Bin 0 -> 167 bytes
 tokopedia/tokopedia/items.json | 0
 tokopedia/tokopedia/items.py | 16 +++
 tokopedia/tokopedia/items.pyc | Bin 0 -> 532 bytes
 tokopedia/tokopedia/middlewares.py | 56 ++++++++++
 tokopedia/tokopedia/pipelines.py | 36 +++++++
 tokopedia/tokopedia/pipelines.pyc | Bin 0 -> 1511 bytes
 tokopedia/tokopedia/settings.py | 96 ++++++++++++++++++
 tokopedia/tokopedia/settings.pyc | Bin 0 -> 523 bytes
 tokopedia/tokopedia/spiders/__init__.py | 4 +
 tokopedia/tokopedia/spiders/__init__.pyc | Bin 0 -> 175 bytes
 .../tokopedia/spiders/tokopedia_spider.py | 25 +++++
 .../tokopedia/spiders/tokopedia_spider.pyc | Bin 0 -> 1417 bytes
 15 files changed, 244 insertions(+)
 create mode 100644 tokopedia/scrapy.cfg
 create mode 100644 tokopedia/tokopedia/__init__.py
 create mode 100644 tokopedia/tokopedia/__init__.pyc
 create mode 100644 tokopedia/tokopedia/items.json
 create mode 100644 tokopedia/tokopedia/items.py
 create mode 100644 tokopedia/tokopedia/items.pyc
 create mode 100644 tokopedia/tokopedia/middlewares.py
 create mode 100644 tokopedia/tokopedia/pipelines.py
 create mode 100644 tokopedia/tokopedia/pipelines.pyc
 create mode 100644 tokopedia/tokopedia/settings.py
 create mode 100644 tokopedia/tokopedia/settings.pyc
 create mode 100644 tokopedia/tokopedia/spiders/__init__.py
 create mode 100644 tokopedia/tokopedia/spiders/__init__.pyc
 create mode 100644 tokopedia/tokopedia/spiders/tokopedia_spider.py
 create mode 100644 tokopedia/tokopedia/spiders/tokopedia_spider.pyc

diff --git a/tokopedia/scrapy.cfg b/tokopedia/scrapy.cfg
new file mode 100644
index 00000000..44d8f940
--- /dev/null
+++ b/tokopedia/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = tokopedia.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = tokopedia
diff --git a/tokopedia/tokopedia/__init__.py b/tokopedia/tokopedia/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tokopedia/tokopedia/__init__.pyc b/tokopedia/tokopedia/__init__.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..771cd14e86b34790d21745b22ce1562c2ead91ee
diff --git a/tokopedia/tokopedia/items.json b/tokopedia/tokopedia/items.json
new file mode 100644
index 00000000..e69de29b
diff --git a/tokopedia/tokopedia/items.py b/tokopedia/tokopedia/items.py
new file mode 100644
index 00000000..89b7ffc4
--- /dev/null
+++ b/tokopedia/tokopedia/items.py
@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+from scrapy.item import Item, Field
+
+
+class TokopediaItem(Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    name = Field()
+    price = Field()
+    url = Field()
diff --git a/tokopedia/tokopedia/items.pyc b/tokopedia/tokopedia/items.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d13ea8337fb9505ea692667e8c3495ce5192961d
diff --git a/tokopedia/tokopedia/settings.py b/tokopedia/tokopedia/settings.py
new file mode 100644
index 00000000..c6d49df3
--- /dev/null
+++ b/tokopedia/tokopedia/settings.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for tokopedia project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+# http://doc.scrapy.org/en/latest/topics/settings.html
+# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'tokopedia'
+
+SPIDER_MODULES = ['tokopedia.spiders']
+NEWSPIDER_MODULE = 'tokopedia.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'tokopedia (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#    'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'tokopedia.middlewares.TokopediaSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'tokopedia.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'tokopedia.pipelines.TokopediaPipeline': 300,
+#}
+ITEM_PIPELINES = {'tokopedia.pipelines.TokopediaPipeline': 300}
+
+MONGODB_SERVER = "localhost"
+MONGODB_PORT = 27017
+MONGODB_DB = "tokopedia"
+MONGODB_COLLECTION = "item"
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/tokopedia/tokopedia/settings.pyc b/tokopedia/tokopedia/settings.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc624bbc5445ec6d4861f8a150f011abd925a590
diff --git a/tokopedia/tokopedia/spiders/tokopedia_spider.py b/tokopedia/tokopedia/spiders/tokopedia_spider.py
new file mode 100644
index 00000000..08e3d712
--- /dev/null
+++ b/tokopedia/tokopedia/spiders/tokopedia_spider.py
@@ -0,0 +1,25 @@
+from scrapy import Spider
+from scrapy.selector import Selector
+
+from tokopedia.items import TokopediaItem
+
+
+class TokopediaSpider(Spider):
+    name = "tokopedia"
+    allowed_domains = ["tokopedia.com"]
+    start_urls = [
+        "https://www.tokopedia.com/p/handphone-tablet/handphone?page=1",
+    ]
+
+    def parse(self, response):
+        cards = Selector(response).xpath('//div[@class="product-summary"]')
+
+        for card in cards:
+            item = TokopediaItem()
+            item['name'] = card.xpath(
+                'div[@class="product-name ng-binding"]/text()').extract()[0]
+            item['price'] = card.xpath(
+                'div[@class="product-price ng-binding"]/text()').extract()[0]
+            item['url'] = card.xpath(
+                'a[@class="ng-href"]/@href').extract()[0]
+            yield item
diff --git a/tokopedia/tokopedia/spiders/tokopedia_spider.pyc b/tokopedia/tokopedia/spiders/tokopedia_spider.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd657fd2093febb1a2d4b4ebfae2f0f9919ce757
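
The settings in this patch register tokopedia.pipelines.TokopediaPipeline and define MONGODB_SERVER, MONGODB_PORT, MONGODB_DB, and MONGODB_COLLECTION. A minimal pymongo-backed pipeline consistent with those settings might look like the sketch below; the class body is an assumption for illustration, not necessarily the project's actual pipelines.py.

import pymongo


class TokopediaPipeline(object):
    """Sketch of a pipeline that stores each scraped item in MongoDB."""

    def __init__(self, server, port, db, collection):
        self.server = server
        self.port = port
        self.db_name = db
        self.collection_name = collection

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection details from the MONGODB_* settings above.
        s = crawler.settings
        return cls(s.get('MONGODB_SERVER'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DB'), s.get('MONGODB_COLLECTION'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.server, self.port)
        self.collection = self.client[self.db_name][self.collection_name]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert_one expects a plain dict, not a Scrapy Item.
        self.collection.insert_one(dict(item))
        return item

With a pipeline like this enabled, the spider can be run with the standard command scrapy crawl tokopedia (optionally with -o items.json to also export the scraped items to the items.json file added in this patch).
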
From c6a8c2e300553ae9bc01e25378ed7b6fb55146cf Mon Sep 17 00:00:00 2001
From: rizkiduwinanto
Date: Mon, 7 Aug 2017 23:26:16 +0700
Subject: [PATCH 2/2] Add scraper (Scrapper.py)

---
 Scrapper.py | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 Scrapper.py

diff --git a/Scrapper.py b/Scrapper.py
new file mode 100644
index 00000000..b7c25b59
--- /dev/null
+++ b/Scrapper.py
@@ -0,0 +1,40 @@
+from bs4 import BeautifulSoup
+from urllib import urlopen
+import pymysql
+
+# Bukalapak handphone category URL
+bukalapak = "https://www.bukalapak.com/c/handphone?source=navbar&from=navbar_categories"
+
+# fetch the Bukalapak listing page
+bukalapakClient = urlopen(bukalapak)
+html_page = bukalapakClient.read()
+bukalapakClient.close()
+
+# open the database connection
+db = pymysql.connect(db='base', user='root', passwd='pwd', unix_socket="/tmp/mysql.sock")
+cursor = db.cursor()
+cursor.execute("DROP TABLE IF EXISTS database1")
+
+sql = """CREATE TABLE database1 (Nama Varchar(100), Harga Int )"""
+cursor.execute(sql)
+
+
+# scrape name and price from Bukalapak
+bukalapakPage = BeautifulSoup(html_page, "html5lib")
+products = bukalapakPage.find_all("li", class_ = "product--sem col-12--2")
+for product in products:
+    product_name = product.div.article.div.a["title"]
+    product_price = product.find("div", class_ = "product-price").find("span", class_ = "amount positive").text
+    # insert into the table
+    try:
+        cursor.execute("""INSERT INTO database1 VALUES (%s,%s)""",(product_name,product_price))
+        db.commit()
+    except:
+        db.rollback()
+
+# show the table contents
+cursor.execute("""SELECT * FROM database1;""")
+
+print cursor.fetchall()
+
+db.close()
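
Scrapper.py inserts product_price straight into an Int column, but the scraped price is a formatted string (something like 'Rp1.550.000'; the exact format is an assumption here, not taken from the patch). A small helper to normalise the text before the INSERT might look like this sketch:

def parse_price(text):
    # Keep only the digits of an assumed 'Rp1.550.000'-style price string.
    digits = ''.join(ch for ch in text if ch.isdigit())
    return int(digits) if digits else None

# usage inside the product loop, in place of the existing INSERT:
# cursor.execute("""INSERT INTO database1 VALUES (%s,%s)""",
#                (product_name, parse_price(product_price)))

Without some normalisation like this, MySQL may reject or zero out the value, and the bare except around the INSERT will silently roll the row back.
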