Using elem.send_keys to handle "infinite scroll" on a page, with Selenium and PhantomJS in Python
I need to collect elements, identified by XPath, from an "infinite scroll" web page like this one. The problem is that when I use the Selenium WebDriver with PhantomJS, it only picks up the first links, i.e. the ones loaded right after the page has loaded. Increasing time.sleep(), or inserting more sleeps in the code, does not help. If I use the Firefox WebDriver, it works well.
Is it possible to solve this problem and improve the code? Maybe not by using timed delays, but by looking for something that can tell whether the page has actually moved down or not.
Thanks and greetings.
import codecs
import platform
import re
import time

import mechanize
import scrapy
from pydblite import Base
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class getfrom(object):
    """Print the text of "graf--h2" elements from an infinite-scroll page."""

    def scrapying(self):
        """Open the page in PhantomJS, page down repeatedly, print the results.

        Prints the platform name, loads the page, waits five seconds, sends
        PAGE_DOWN to the <body> element 200 times (20 ms apart), then prints
        the text of every element whose class is "graf--h2".

        The browser is always shut down, even if a step raises.
        """
        print(platform.system())

        # Swapping in webdriver.Firefox() here is the variant reported to work:
        # browser = webdriver.Firefox()
        browser = webdriver.PhantomJS(executable_path='/usr/local/bin/node_modules/phantomjs/lib/phantom/bin/phantomjs')
        try:
            browser.get("https://medium.com/top-100/december-2013")
            # Give the initial page load time to finish.
            time.sleep(5)

            elem = browser.find_element_by_tag_name("body")

            no_of_pagedowns = 200
            for _ in range(no_of_pagedowns):
                elem.send_keys(Keys.PAGE_DOWN)
                time.sleep(0.02)

            # The spider should go here.
            post_elems = browser.find_elements_by_class_name("graf--h2")
            # End of spider.

            for post in post_elems:
                print(post.text)
        finally:
            # Without this, an exception above would leave the headless
            # browser process running.
            browser.quit()


myclassobject = getfrom()
myclassobject.scrapying()
I modified the code like this:
import codecs
import platform
import re
import time

import mechanize
import scrapy
from pydblite import Base
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3

# Pause after each scroll so newly requested content has a chance to load.
SCROLL_PAUSE_SECONDS = 0.5
# Stop early once this many consecutive scrolls fail to make the page taller.
# NOTE(review): on a slow connection this may trigger before the page is
# really exhausted; raise it if results look truncated.
MAX_STALLED_SCROLLS = 5

SCROLL_TO_BOTTOM_JS = "window.scrollTo(0, document.body.scrollHeight);"
PAGE_HEIGHT_JS = "return document.body.scrollHeight;"


class getfrom(object):
    """Print the text of "graf--h2" elements from an infinite-scroll page."""

    def scrapying(self):
        """Scroll the page for a user-chosen time, then print the results.

        Uses Firefox on Windows and PhantomJS elsewhere. Asks on stdin for a
        number of minutes, then keeps scrolling to the bottom of the document
        via JavaScript until either the time is up or the page height has
        stopped growing for MAX_STALLED_SCROLLS consecutive scrolls. Finally
        prints the text of every element whose class is "graf--h2".

        Raises:
            ValueError: if the answer to the prompt is not an integer.
        """
        print(platform.system())
        if platform.system() == "Windows":
            browser = webdriver.Firefox()
        else:
            browser = webdriver.PhantomJS(executable_path='/usr/local/bin/node_modules/phantomjs/lib/phantom/bin/phantomjs')

        try:
            browser.get("https://medium.com/top-100/december-2013")
            # Give the initial page load time to finish.
            time.sleep(5)

            mins = read_line("for how many minutes want scrapy links? ")
            print("start: " + time.asctime(time.localtime(time.time())))
            timeout = time.time() + 60 * int(mins)

            stalled_scrolls = 0
            last_height = browser.execute_script(PAGE_HEIGHT_JS)
            while True:
                # Scrolling through JavaScript instead of sending PAGE_DOWN
                # key presses is what makes the lazy loading fire.
                browser.execute_script(SCROLL_TO_BOTTOM_JS)
                time.sleep(SCROLL_PAUSE_SECONDS)

                # The counter only advances while the page stays the same
                # height, and resets as soon as new content arrives.
                new_height = browser.execute_script(PAGE_HEIGHT_JS)
                if new_height == last_height:
                    stalled_scrolls += 1
                else:
                    stalled_scrolls = 0
                    last_height = new_height

                if stalled_scrolls == MAX_STALLED_SCROLLS or time.time() > timeout:
                    print("end: " + time.asctime(time.localtime(time.time())))
                    break

            # The spider should go here.
            post_elems = browser.find_elements_by_class_name("graf--h2")
            # End of spider.

            for post in post_elems:
                print(post.text)
        finally:
            # Without this, an exception above would leave the browser
            # process running.
            browser.quit()


myclassobject = getfrom()
myclassobject.scrapying()

# Now it works fine. I deleted the fixed-count loop and added a time control,
# to handle the possibility of getting only a few results or lots of them.
# The most important thing is to use the JavaScript function:
# browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
instead of send_keys.
Comments
Post a Comment