Commit c9366674 authored by Joschka Hüllmann

initial commit
# -*- coding: utf-8 -*-
"""The purpose of this module is to implement youtubeLiker"""
import pickle
import time
from selenium import webdriver
from configparser import SafeConfigParser
from support import *
class CookieDumper():
def cookieDumper(stringIDs, pages, delay):
"""
The cookieDumper method helps with dumping session cookies for each profile. The persistent cookies are saved in the Firefox profiles.
Dumping session cookies helps with logging into the websites and is mandatory for automisedProfileValidation to work.
This method only has to be used the first time.
Args:
stringIDs: string array of the names of the profiles
pages: set of pages for which cookies should be saved
delay: timer until the cookies get dumped (you will need to log in within that timeframe)
Returns: cookie file (.pkl) named 'stringID + page + Cookies.pkl'
"""
_config = SafeConfigParser()
_config.read('config.ini')
_cookieDirectory = _config.get('directories', 'cookieDirectory')
_profileDirectory = _config.get('directories', 'profileDirectory')
for _usr in stringIDs:
_tmpProfile = webdriver.FirefoxProfile(_profileDirectory + _config.get('profiles', _usr))
_tmpDriver = webdriver.Firefox(firefox_profile=_tmpProfile, capabilities=support.proxy(_config.get('proxy', _usr)))
for _p in pages:
_tmpDriver.get('https://' + _p)
time.sleep(delay)
_p = _p.title()
pickle.dump(_tmpDriver.get_cookies() , open((_cookieDirectory +_usr+ _p+ "Cookies.pkl"),"wb"))
print('cookies dumped')
_tmpDriver.quit()
"""
Select the profile you want to dump cookies for and the website. Then choose a delay that allows you
to log into the profile account on that website. After the delay the cookies will be dumped.
Clicking on browser options like "Save this password" stores the log-in cookies in the Firefox profile,
which allows you to log in without loading the session cookies, so you can choose a very short delay.
"""
CookieDumper.cookieDumper(["Hannes", "Daniel", "Linda", "Hildegard"], ["cyberport.de", "esprit.de", "zalando.de", "mediamarkt.de", "amazon.de", "tchibo.de", "otto.de"], 1)
# -*- coding: utf-8 -*-
"""
The purpose of this module is to alter the *-import for the liker package,
so it only includes the modules specified in the config
"""
from configparser import SafeConfigParser
_config = SafeConfigParser()
_config.read('config.ini')
_methods = _config.get('methods', 'liker').split()
__all__ = _methods
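#How the filtered __all__ is consumed is not part of this module (the start script is not shown in
#this commit). A hypothetical dispatch sketch, relying on module, class and method sharing a name:
#
#    import liker
#    from liker import *
#    for _name in liker.__all__:
#        _module = getattr(liker, _name)                      #e.g. the liker.twitterLiker module
#        _method = getattr(getattr(_module, _name), _name)    #e.g. twitterLiker.twitterLiker.twitterLiker
#        _method(driver, searchTerms)                         #driver and searchTerms come from the caller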
# -*- coding: utf-8 -*-
"""The purpose of this module is to implement redditLiker"""
import time
import random
from random import randint
from selenium.common.exceptions import TimeoutException
class bildLiker():
def bildLiker(driver, searchTerms):
"""
This method is responsible for automating Bild activity.
Each search term is used in the Bild search by adding it to the URL.
On the results page, 2 of the shown articles will be opened at random and read for a few seconds.
Args:
driver: selenium webdriver object (contains profile)
searchTerms: string list of the profiles interests
Raises:
TimeoutException: if the site did not load in time
"""
for _searchTerm in searchTerms:
_searchString = "https://www.bild.de/suche.bild.html?query=" + _searchTerm
try:
driver.get(_searchString)
print ("Page is ready!")
except TimeoutException:
print ("Loading took too much time!")
_randomNumber = random.sample(range(0, 9), 2)
_randomNumber.sort()
for _ran in _randomNumber:
_news = driver.find_elements_by_xpath("/html/body/div[2]/div[2]/div[5]/div/section/ol/li/div/a/img")
_news[_ran].click()
time.sleep(3)
_loop = True
#Sometimes the ".back"-Method does not work properly if the site has pop-ups, so it will be looped until it works.
while _loop:
try:
driver.back()
except TimeoutException:
print ("Loading took too much time!")
time.sleep(3)
if ".de/suche" in driver.current_url:
_loop = False
# -*- coding: utf-8 -*-
"""The purpose of this module is to implement redditLiker"""
import time
import random
from random import randint
from selenium.common.exceptions import TimeoutException
class facebookLiker():
def facebookLiker(driver, searchTerms):
"""
This method is responsible for automating facebook activity.
Each search term is used in the facebook search by adding it to the URL.
Then a variety of like buttons are pressed: 1 of the 3 follower options and all related posts will be liked.
Args:
driver: selenium webdriver object (contains profile)
searchTerms: string list of the profiles interests
Raises:
TimeoutException: if the site did not load in time
Exception: if an element could not be found
"""
for _searchTerm in searchTerms:
_searchString = "https://www.facebook.com/search/top/?q=" + _searchTerm
try:
driver.get(_searchString)
print ("Page is ready!")
except TimeoutException:
print ("Loading took too much time!")
_likeButtons1 = driver.find_elements_by_xpath("/html/body/div[1]/div[3]/div[1]/div/div[3]/div[2]/div/div/div[3]/div/div/div/div[1]/div/div/div/div/div/div[2]/div/div/div[1]/div")
try:
_likeButtons1[randint(0, 2)].click()
except Exception:
print ("LikeButton1 could not be scrolled into view")
driver.get("https://www.facebook.com/")
_likeButtons2 = driver.find_elements_by_xpath("/html/body/div[1]/div[3]/div[1]/div/div[2]/div[2]/div[1]/div[2]/div/div[5]/div/div/div[1]/div/div/div/div/div[1]/div/div[3]/div[2]/form/div[1]/div/div/div/div[2]/div/div/span[1]")
driver.execute_script("window.scrollTo(0, 1200)")
_likeButtons3 = driver.find_elements_by_xpath("/html/body/div[1]/div[3]/div[1]/div/div[2]/div[2]/div[1]/div[2]/div/div[5]/div/div/div/div[1]/div[1]/div/div/div/div/div/div/div[3]/div[2]/form/div[1]/div/div/div/div[2]/div/div/span[1]")
_likeButtons4 = driver.find_elements_by_xpath("/html/body/div[1]/div[3]/div[1]/div/div[2]/div[2]/div[1]/div[2]/div/div[5]/div/div/div/div/div/div/div/div[1]/div/ul/li/div/div[3]/div[2]/form/div[1]/div/div/div/div[2]/div/div/span[1]")
_likeButtons = _likeButtons2 + _likeButtons3 + _likeButtons4
for _btn in _likeButtons:
try:
_btn.click()
except Exception:
print ("Button could not be found")
# -*- coding: utf-8 -*-
"""The purpose of this module is to implement youtubeLiker"""
import time
import random
from random import randint
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
class googleLiker():
def googleLiker(driver, searchTerms):
"""
This method is responsible for automating google activity.
Each search term is typed into the google search bar and confirmed by pressing enter.
On the results page 2 hits will be clicked (headlines prioritised).
Several sleeps let the site load and also fake realistic behaviour better.
Args:
driver: selenium webdriver object (contains profile)
searchTerms: string list of the profiles interests
Raises:
TimeoutException: if the site did not load in time
Exception: if an element could not be found
"""
for _searchTerm in searchTerms:
_searchBar = driver.find_element_by_xpath("//*[@id='lst-ib']")
_searchBar.clear()
_searchBar.send_keys(_searchTerm)
_searchBar.send_keys(Keys.ENTER)
time.sleep(5)
"""
_news1 are slim headlines, _news2 are normal results and _news are rectangular headlines. Priority: _news > _news1 > _news2.
_news are searched for twice: once to count the elements and a second time to click them. This has to be done again,
because Selenium will identify them as different elements after the driver.back call.
"""
_news1 = driver.find_elements_by_xpath("/html/body/div[5]/div[3]/div[10]/div[1]/div[2]/div/div[2]/div[2]/div/div/div/div/div/g-section-with-header/div[2]/div/g-scrolling-carousel/div/div/div/div/g-inner-card/a/div[2]/div")
_news2 = driver.find_elements_by_xpath("/html/body/div[5]/div[3]/div[10]/div[1]/div[2]/div/div[2]/div[2]/div/div/div/div/div/div/div/div/h3/a")
_news = driver.find_elements_by_xpath("/html/body/div[5]/div[3]/div[10]/div[1]/div[2]/div/div[2]/div[2]/div/div/div/div/div/g-section-with-header/div[2]/div/g-inner-card/div/g-card-section/a/div/span")
if len(_news)<=1:
_news = _news1
if len(_news)==0:
_news = _news2
_randomNumber = random.sample(range(0, (len(_news))), 2)
_randomNumber.sort()
for _ran in _randomNumber:
_news1 = driver.find_elements_by_xpath("/html/body/div[5]/div[3]/div[10]/div[1]/div[2]/div/div[2]/div[2]/div/div/div/div/div/g-section-with-header/div[2]/div/g-scrolling-carousel/div/div/div/div/g-inner-card/a/div[2]/div")
_news2 = driver.find_elements_by_xpath("/html/body/div[5]/div[3]/div[10]/div[1]/div[2]/div/div[2]/div[2]/div/div/div/div/div/div/div/div/h3/a")
_news = driver.find_elements_by_xpath("/html/body/div[5]/div[3]/div[10]/div[1]/div[2]/div/div[2]/div[2]/div/div/div/div/div/g-section-with-header/div[2]/div/g-inner-card/div/g-card-section/a/div/span")
if len(_news)==0:
_news = _news1
if len(_news)==0:
_news = _news2
try:
_news[_ran].click()
except Exception:
print ("GoogleLiker: Not enough interesting news")
time.sleep(5)
#Sometimes the ".back"-Method does not work properly if the site has pop-ups, so it will be looped until it works.
_loop = True
while _loop:
try:
driver.back()
except TimeoutException:
print ("Loading took too much time!")
time.sleep(3)
if "google.com" in driver.current_url:
_loop = False
# -*- coding: utf-8 -*-
"""The purpose of this module is to implement redditLiker"""
import time
import random
from random import randint
from selenium.common.exceptions import TimeoutException
class redditLiker():
def redditLiker(driver, searchTerms):
"""
This method is responsible for automating reddit activity.
Each search term is used in the reddit search by adding it to the URL.
Then 2 of the 3 shown subscribe options will be clicked and one third of all shown posts will be upvoted.
There are a bunch of sleeps, because even though ".get" waits for the entire page to load,
sometimes the buttons load a little bit later. They also fake real behaviour better.
Args:
driver: selenium webdriver object (contains profile)
searchTerms: string list of the profiles interests
Raises:
TimeoutException: if the site did not load in time
Exception: if an element/button could not be found
"""
for _searchTerm in searchTerms:
_searchString = "https://new.reddit.com/search?q=" + _searchTerm + "&t=all&sort=new"
try:
driver.get(_searchString)
time.sleep(2)
print ("Page is ready!")
except TimeoutException:
print ("Loading took too much time!")
_subscribeButtons = driver.find_elements_by_xpath("/html/body/div[1]/div/div[2]/div/div/div[1]/div[2]/div/div/div/div[2]/div[2]/div[1]/div/div[2]/div/div[2]/div/div/a/div[3]/button")
print(len(_subscribeButtons))
_randomNumber = random.sample(range(0, 3), 2)
_randomNumber.sort()
for _ran in _randomNumber:
try:
_subscribeButtons[_ran].click()
time.sleep(1)
except Exception:
print ("SubscribeButtons could not be scrolled into view")
_likeButtons = driver.find_elements_by_xpath('/html/body/div[1]/div/div[2]/div/div/div[1]/div/div/div/div/div[2]/div[2]/div[1]/div/div[3]/div[1]/div/div/div/div[1]/div/button[1]')
_lmt = len(_likeButtons)
_smallLmt = int(_lmt/3)
_randomNumber = random.sample(range(0, _lmt), _smallLmt)
_randomNumber.sort()
for i in range (0, _smallLmt):
try:
_likeButtons[_randomNumber[i]].click()
except Exception:
print ("Like could not be scrolled into view")
# -*- coding: utf-8 -*-
"""The purpose of this module is to implement twitterLiker"""
import time
import random
from random import randint
from selenium.common.exceptions import TimeoutException
class twitterLiker():
def twitterLiker(driver, searchTerms):
"""
This method is responsible for automating twitter activity.
Each search term is used in the twitter search by adding it to the URL.
On that page the shown users will be followed by locating the follow element via XPath and clicking it.
To get newer tweets it navigates to the news results and randomly likes 3 of the shown tweets.
Args:
driver: selenium webdriver object (contains profile)
searchTerms: string list of the profiles interests
Raises:
TimeoutException: if the site did not load in time
Exception: if an element(heart/follow) could not be found
"""
for _searchTerm in searchTerms:
_searchString = "https://twitter.com/search?q=" + _searchTerm + "&src=typd"
try:
driver.get(_searchString)
time.sleep(1)
print("Page is ready!")
except TimeoutException:
print ("Loading took too much time!")
_followButtons = driver.find_elements_by_xpath("/html/body/div[2]/div[2]/div/div[2]/div/div/div[2]/div/div/div/div/div[2]/ol[1]/li[1]/div[2]/div/div/div/div/div[1]/div/div/div/span[2]")
for _btn in _followButtons:
try:
_btn.click()
except Exception:
print ("Already Followed/Follow could not be scrolled into view")
_searchString2 = "https://twitter.com/search?f=tweets&vertical=news&q=" + _searchTerm + "&src=typd"
try:
driver.get(_searchString2)
time.sleep(1)
print ("Page is ready!")
except TimeoutException:
print ("Loading took too much time!")
_heartButtons = driver.find_elements_by_xpath("/html/body/div[2]/div[2]/div/div[2]/div/div/div[2]/div/div/div/div/div[2]/ol[1]/li/div/div[2]/div/div[2]/div[3]/button[1]")
_randomNumber = random.sample(range(0, 9), 3)
_randomNumber.sort()
for i in range (0, 3):
try:
_heartButtons[_randomNumber[i]].click()
except Exception:
print ("Heart could not be scrolled into view")
# -*- coding: utf-8 -*-
"""The purpose of this module is to implement youtubeLiker"""
import time
import random
from random import randint
from selenium.common.exceptions import TimeoutException
class youtubeLiker():
def youtubeLiker(driver, searchTerms):
"""
This method is responsible for automating youtube activity.
Each search term is used in the youtube search by adding it to the URL.
On the results page 1 video will be randomly viewed for 45 seconds to ensure it is registered as a view.
The video will be liked after 30 seconds.
Args:
driver: selenium webdriver object (contains profile)
searchTerms: string list of the profiles interests
Raises:
TimeoutException: if the site did not load in time
Exception: if an element could not be found
"""
for _searchTerm in searchTerms:
_searchString = "https://www.youtube.com/results?search_query=" + _searchTerm
try:
driver.get(_searchString)
print ("Page is ready!")
except TimeoutException:
print ("Loading took too much time!")
_videoButtons = driver.find_elements_by_xpath("/html/body/ytd-app/div[1]/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[3]/ytd-item-section-renderer/div[2]/ytd-video-renderer")
try:
_videoButtons[randint(0, len(_videoButtons)-1)].click()
print ("Page is ready!")
except Exception:
print ("Loading took too much time!")
time.sleep(30) #validate view
try:
_likeButton = driver.find_element_by_xpath("/html/body/ytd-app/div[1]/ytd-page-manager/ytd-watch/div[2]/div[2]/div/div[6]/div[2]/ytd-video-primary-info-renderer/div/div/div[3]/div/ytd-menu-renderer/div/ytd-toggle-button-renderer[1]")
#_dislikeButton = driver.find_element_by_xpath("/html/body/ytd-app/div[1]/ytd-page-manager/ytd-watch/div[2]/div[2]/div/div[6]/div[2]/ytd-video-primary-info-renderer/div/div/div[3]/div/ytd-menu-renderer/div/ytd-toggle-button-renderer[2]")
_likeButton.click()
except Exception:
print ("Could not find/click button")
time.sleep(15) #validate view
#Sometimes the ".back"-Method does not work properly if the site has pop-ups, so it will be looped until it works.
loop = True
while loop:
try:
driver.back()
except TimeoutException:
print ("Loading took too much time!")
time.sleep(5)
if "results?search" in driver.current_url:
loop = False
# Installation guide
Be aware that the Firefox profiles are 700 MB+, so unless you want to reuse my
profiles, do not include them in the clone directory.
1. Install Python 3.
2. Install the requirements using pip3 (included with Python 3): `pip3 install -r requirements.txt`
3. Download geckodriver for Mozilla Firefox. It has to be on your PATH.
4. Check the config to change the directories and specify your personal settings.
5. If you want to use your own profiles, be sure to run "cookieDumper" beforehand.
6. Start "start".
;Specify the location of the cookies and the Firefox profiles
[directories]
cookiedirectory = /pd-thesis-public/cookiefile/
profiledirectory = /mnt/c/Users/UserName/AppData/Roaming/Mozilla/Firefox/Profiles/
scrapeDirectory = /pd-thesis-public/scraperResults
;Names of the Firefox profiles within the directory
[profiles]
Hannes = fz9a3ja0.dev-edition-default
Daniel = p5wq7bed.DanielKraemer
Linda = nxhb41z9.LindaMeier
Hildegard = i0hpiycf.HildegardEvers
Kai = akj173fm.KaiKrefeld
;Specify a proxy for each profile. If you do not want a custom proxy,
;set the first value to False
[proxy]
Hannes = False 46.101.145.206:3128
Daniel = False 90.187.51.41:8080
Linda = False 213.185.81.135:80
Hildegard = False 5.189.163.229:3128
Kai = False 46.101.121.186:1697
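#support.proxy (used when the webdrivers are created) is not included in this commit. A hypothetical
#sketch of how the "UseProxy host:port" value above might be turned into Selenium capabilities:
#
#    from selenium.webdriver import DesiredCapabilities
#
#    def proxy(setting):
#        _use, _address = setting.split()
#        _capabilities = DesiredCapabilities.FIREFOX.copy()
#        if _use == "True":
#            _capabilities['proxy'] = {'proxyType': 'MANUAL',
#                                      'httpProxy': _address,
#                                      'sslProxy': _address}
#        return _capabilities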
;Specify the profiles interests
[interests]
Hannes = Frankfurt Fussball
Daniel = Tennis Dortmund
Linda = Reiten Muenchen
Hildegard = Kochen Nachrichten
Kai = Gucci Rolex
;Specify which profiles should be used in the next run of automisedProfileValidation
[user]
used = Kai Hannes Daniel Linda Hildegard
;Kai Hannes Daniel Linda Hildegard
;Specify which methods should be used in the next run.
;liker: List of liker names separated by a blank space
;scraper: Boolean: Used = True
[methods]
liker = twitterLiker redditLiker facebookLiker youtubeLiker
;twitterLiker redditLiker facebookLiker googleLiker bildLiker youtubeLiker
scraper = True
;pages: Specify which pages should be visited in the next run.
;repeats: Enter the number of scrape runs
;delay: specify the waiting time between runs
;for the other keys enter a list of product URLs for the respective page
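;Relative entries are appended to "https://www." + the page name by the scraper, e.g.
;  tchibo.de = /v-pullover-p400116370.html  ->  https://www.tchibo.de/v-pullover-p400116370.html
;entries that already contain a full URL are opened as-is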
[scrape]
pages = cyberport.de esprit.de zalando.de mediamarkt.de amazon.de tchibo.de otto.de
repeats = 8
delay = 1200
;cyberport.de esprit.de zalando.de mediamarkt.de tchibo.de otto.de amazon.de
esprit.de = /maedchenmode/web-kleid-mit-ethno-print-und-pompon-details-RL3027304_403 /sale/damen-sale/pullover-strickjacken/pullover/pullover-aus-baumwolle-mit-punkte-print-038CC1I024_400 /herren-hemden/kurzarmhemd-mit-allover-print-aus-baumwolle-058EE2F001_330 /herren-diverses/diverses/sonnenbrillen/sonnenbrille-im-material-mix-39032S_505 /damenmode/bekleidung/online-exclusive/taillierte-jeans-jacke-im-biker-stil-078EE1G002_921
hm.com = /de/product/07271?article=07271-B
zalando.de = /erima-razor-2-0-teamwear-new-royal-black-white-1er42d039-k11.html /adidas-performance-questar-tnd-laufschuh-neutral-carbonclaquablack-ad541a15f-q11.html /soliver-black-label-blazer-greyblack-soa21g01o-c11.html /wrangler-ringer-tee-t-shirt-print-faded-black-wr122o00w-q11.html /fratelli-rossetti-slipper-black-f1512m004-q11.html
cyberport.de = /markenshops/outlet/restposten-a-ware-/pdp/1h62-020/apple-ipad-2017-wi-fi-32-gb-spacegrau-mp2f2fd-a-.html /markenshops/outlet/restposten-a-ware-/notebook-tablet/pdp/1h54-015/huawei-mediapad-m3-tablet-wifi-32-gb-android-6-0-silber.html /notebook-und-tablet/notebooks/huawei/pdp/1ce3-004/huawei-matebook-x-pro-w19c-notebook-grau-i5-8250u-ssd-3k-gf-mx150-windows-10.html /tv-audio/fernseher/samsung/pdp/7910-9qv/samsung-ue75nu8009-189cm-75-4k-uhd-smart-fernseher.html /pc-und-zubehoer/pc-systeme/alle-pc-systeme/raspberry/pdp/1123-00g/raspberry-pi-3-modell-b-1-gb.html
notebooksbilliger.de = /asus+fx753ve+gc218/incrpc/topprod
mediamarkt.de = /de/product/_sony-ps4-wireless-dualshock-4-redesigned-2187302.html /de/product/_brother-hl-3152cdw-1957620.html /de/product/_playstation-4-1tb-schwarz-fifa-18-2-dualshock4-controller-ps-plus-14-tage-playstation-4-konsolen-2324918.html /de/product/_yamaha-piagerro-np-12wh-2102755.html /de/product/_bosch-wtw875w0-1949527.html
conrad.de = /de/externe-festplatte-89-cm-35-zoll-3-tb-intenso-memory-box-schwarz-usb-30-417397.html?WT.ac=hp_top_technik_sale_product_417397&sc.ref=Homepage
tchibo.de = /v-pullover-p400116370.html /outdoor-jeans-p400100646.html /versilberte-sekt-und-champagnerschale-p400112370.html /xl-ordnungsboxen-set-p400133658.html /ledertasche-p400078186.html
;/funktions-outdoormantel-p400108556.html
otto.de = /p/nike-sportswear-air-max-nostalgic-sneaker-610416362/#variationId=610417633 /p/tom-tailor-denim-slim-fit-jeans-slim-aedan-623897815/#variationId=623905217 /p/studio-coletti-baukastensakko-nizza-338497370/#variationId=338501916 /p/kangaroos-parka-in-leichter-qualitaet-mit-vielen-details-606357911/#variationId=606357912 /p/rosefield-quarzuhr-west-village-wspr-w73-592473167/#variationId=592319868
amazon.de = /dp/B01M3015CT/ref=gw_tabl_Sz_C2_PD18?pf_rd_p=1fa4b2f2-8933-45de-8b78-1526d6fb3822&pf_rd_r=T4JV69Y9JCV28W5TCZE8 /Samsung-Smartphone-Touch-Display-interner-Speicher-midnight-black/dp/B06XJ49G5B/ref=sr_1_1?m=A3JWKAKR8XB7XF&s=ce-de&ie=UTF8&qid=1531828932&sr=1-1&smid=A3JWKAKR8XB7XF https://www.amazon.de/gp/product/B075H1XL8S/ref=s9u_ri_gw_i18?ie=UTF8&pd_rd_i=B075H1XL8S&pd_rd_r=1973da8f-8a80-11e8-879e-d3dccc9dfcbf&pd_rd_w=8S8vS&pd_rd_wg=Yp39O&pf_rd_m=A3JWKAKR8XB7XF&pf_rd_s=&pf_rd_r=YPDARM2Z8EBJWX1KNTNP&pf_rd_t=36701&pf_rd_p=00905ce3-35ba-4371-bc69-ca3f8f7e89cf&pf_rd_i=desktop /FIND-Schuhe-Wallabees-Veloursleder-Kontrastnaht/dp/B06XCJ6Q76/ref=sr_1_5?s=prime-day-secondary&psr=PDAY&ie=UTF8&qid=1531829014&sr=1-5&keywords=schuhe /Siemens-WM14W740-sensoFresh-Waschmaschine-varioPerfect/dp/B015T3OASA
;/Acer-S242HLDBID-Monitor-Reaktionszeit-schwarz/dp/B01AJTVCA8/ref=sr_1_1?ie=UTF8&qid=1531828998&sr=8-1&smid=A3JWKAKR8XB7XF
# -*- coding: utf-8 -*-
"""The purpose of this module is solely to implement priceWebScraper"""
import os
import time
import datetime
import re
from configparser import SafeConfigParser
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
from support import *
class priceWebScraper():
def scrapeLocationSwitch(website):
"""
Implements a dictionary that contains the location of the
price in the html structure of each website. Used in scrapePrices
Args:
website: Website name String (e.g. esprit.de)
Returns: A string tuple containing location data for the scraper
"""
switch = {
'esprit.de': ('span', 'class', 'spv-price__selling'),
'hm.com': ('span', 'class', 'price'), #has severe bot protection
'zalando.de': ('h4', 'class', 'h-text h-color-red title-3 h-p-top-m', "h-text h-color-black title-3 h-p-top-m"),
'cyberport.de': ('div', 'class','price orange', 'price '),
'notebooksbilliger.de': ('text', 'class', 'nbb-svg-base'), #has bot protection
'mediamarkt.de': ('div', 'class', 'price'),
'tchibo.de': ('span', 'itemprop', 'price'),
'conrad.de': ('div', 'itemprop', 'price'), #has bot protection
'amazon.de': ('span', 'id', 'priceblock_ourprice', 'priceblock_dealprice'),
'otto.de': ('span', 'id', 'reducedPriceAmount', 'normalPriceAmount')}
return switch.get(website)
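#Example (values taken from the switch above):
#  priceWebScraper.scrapeLocationSwitch('amazon.de')
#  returns ('span', 'id', 'priceblock_ourprice', 'priceblock_dealprice'),
#  i.e. the tag, the attribute to match on and one or more candidate attribute values.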
def createFolder(directory, folder):
"""
Creates a folder in a given directory
Args:
directory: Location where the folder should be created
folder: name of the folder
"""
try:
os.chdir(directory)
if not os.path.exists(folder):
os.makedirs(folder)
except OSError:
print ('Error: Creating directory. ' + folder)
def scrapePrices(driverArray):
"""
Consists of three loops:
-The outer loop will iterate through all specified sites
-The middle loop will iterate through all given drivers from automisedProfileValidation
-The inner loop will visit shopping URLs as specified in the config, uses BeautifulSoup to scrape
the site's content, filters for the price and uses Pandas to build a .csv containing a DataFrame
of prices in relation to the URL to analyse it later.
Args:
driverArray: Array of Selenium webdrivers generated by automisedProfileValidation
Returns: saves a DataFrame .csv file for each specified website to the configured scrapeDirectory
"""
#load config
_config = SafeConfigParser()
_config.read('config.ini')
_scrapeSitesArray = _config.get('scrape', 'pages').split()
_scrapeDelay = _config.get('scrape', 'delay')
_scrapeRepeats = _config.get('scrape', 'repeats')
_profileDirectory = _config.get('directories', 'profileDirectory')
_scrapeDirectory = _config.get('directories', 'scrapeDirectory')
_cookieDirectory = _config.get('directories', 'cookieDirectory')
_user = _config.get('user', 'used').split()
for _ in range(int(_scrapeRepeats)):
print("run " + str(_))
_runIdentifier = "run " + str(datetime.datetime.now())[:19].replace(":", ".")
for _scrape in _scrapeSitesArray:
#This datalist will contain datalists of the users scraped product prices
#Keys will be used to map the driverArray to a dictionary
_metaDatalist = []
_keys = []
#for every user the corresponding webdriver (started with the Firefox profile and proxy from the config) will be used
for _driver in driverArray:
#this datalist will contain the scraped prices of one user for each product as an array.
_datalist = []
#each website URL will be called
_url = "https://www." + _scrape
support.loader(_driver, _user[driverArray.index(_driver)], _scrape, _cookieDirectory)
time.sleep(2)
_products = _config.get('scrape', _scrape).split()
for _prod in _products:
try:
if (_prod[0]=="/"):
_driver.get(_url + _prod)
else:
_driver.get(_prod)
except TimeoutException:
print ("Loading took too much time!")
time.sleep(2)
#BeautifulSoup will scrape the Html source page and filter for set words individually for each site
_soup=BeautifulSoup(_driver.page_source, "html.parser")
#get the location of the price within the html structure
_scrapeLocation = priceWebScraper.scrapeLocationSwitch(_scrape)
_location = _scrapeLocation[0]
_type = _scrapeLocation[1]
_price = "00,00"
#if prices can be found by different names, e.g. reduced price is called "price orange"
#while normal price is called "price ", each will be looked for and the most relevant
#will loop at the end to overwrite previous false results
for _name in _scrapeLocation[2:]:
try:
_priceBox = _soup.find(_location, attrs={_type:_name})
#find the price and cut it by iterating to its start
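#e.g. a scraped text like "1.299,99 €" is cut to "1.299,99" here and normalised to "1299,99" by the replaces below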
_price = _priceBox.text
_price = "#" + _price #indicator for price starting
_middle = _price.find(",")
_loop = True
i = 0
while(_loop == True):
if (_price[_middle-(i+1)] in ['0','1','2','3','4','5','6','7','8','9','.']):
i = i + 1
else:
_loop = False
_price = _price[_middle-i:_middle+3]
_price = _price.replace("-","00")
_price = _price.replace(".","")
_priceFound = _priceFound + 1
except Exception:
pass #do nothing
if (_price != "00,00"):
print("Price found!")
else: print("No Price Found, default is 00,00")
_datalist.append(_price)
_datetime = str(datetime.datetime.now())
_keys.append(driverArray.index(_driver))
_datalist.insert(0,_scrape)
_datalist.insert(0,_datetime)
_metaDatalist.append(_datalist)
support.dumper(_driver, _user[driverArray.index(_driver)], _scrape, _cookieDirectory)
_datetime = _datetime[:19]
_datetime = _datetime.replace(":", ".")
#Build a DataFrame, transpose it and save it using Pandas
_products.insert(0, "site")
_products.insert(0, "time")
_df = pd.DataFrame(_metaDatalist, columns=_products)
_df = _df.rename(index=dict(zip(_keys, _user)))
_df = _df.T
#priceWebScraper.createFolder(_scrapeDirectory, _runIdentifier)
#os.chdir(_scrapeDirectory + "/" + _runIdentifier)
#_df.to_csv((_datetime + " " +_scrape + '.csv'), sep='\t', encoding='utf-8')
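#If the commented-out to_csv call above is enabled, the per-site result files could later be read back
#for analysis like this (sketch, assuming the tab-separated layout written above):
#
#    import pandas as pd
#    _df = pd.read_csv(_datetime + " " + _scrape + '.csv', sep='\t', encoding='utf-8', index_col=0)
#    print(_df)  #rows: time, site and the product URLs; columns: the scraped user profiles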