python - Running multiple web crawlers at the same time in Django -
I created a custom Django management command called crawl.py.
I want to make it run 2 crawlers at the same time. The crawlers are each an object whose crawl() function runs an infinite loop and interacts with the ORM.
```python
def crawl(self):
    current_page = self.start_page
    while True:
        page_response = requests.get('http://magdeleine.co/browse/page/{}/'.format(current_page))
        page_soup = BeautifulSoup(page_response.text)
        image_links = [link["href"] for link in page_soup.find_all('a', {'class': 'photo-link'})]

        for image_link in image_links:
            response = requests.get(image_link)
            image_page_soup = BeautifulSoup(response.text)
            print('getting image source link')
            image_source_link = image_page_soup.find('a',{'class': 'download'})['href']
            # get tags
            print('getting tags')
            ul = image_page_soup.find('ul', {'class': 'tags'})
            tag_links = ul.find_all('a', {'rel':'tag'})
            tag_names = [tag_link.string for tag_link in tag_links]
            try:
                tag_names.remove('editor\'s pick')
            except:
                pass

            if not Image.objects.filter(url=image_source_link).exists():
                image = Image(url=image_source_link, origin="mg")
                print('creating thumbnail')
                image.create_thumb()
                image.save()
                # get or create a new tag for every element in the list
                for tag_name in tag_names:
                    tag, created = Tag.objects.get_or_create(name=tag_name)
                    image.tags.add(tag)

        current_page+=1
        print("end page")
```

In the management command:
```python
class Command(BaseCommand):

    def handle(self, *args, **options):
        pexel_crawler = PexelCrawler()
        pexel_crawler.crawl()

        magdeleine_crawler = MagdeleineCrawler()
        magdeleine_crawler.crawl()
```

I want to run both .crawl() commands together. It would also help if someone could elaborate on how this is done in production vs. development, and on the optimal solutions for this.
You should be able to do this with gevent, something like:
```python
import gevent
from django.core.management.base import BaseCommand

class Command(BaseCommand):
    def handle(self, *args, **options):
        pexel_crawler = PexelCrawler()
        magdeleine_crawler = MagdeleineCrawler()
        pexel_job = gevent.spawn(pexel_crawler.crawl)
        magdeleine_job = gevent.spawn(magdeleine_crawler.crawl)
        gevent.joinall([pexel_job, magdeleine_job])
```

I believe that will work, and it will keep the management command running in the foreground as long as both crawlers are running. Be careful though, because if this works as expected, it will be an infinite loop and will never stop.
Comments
Post a Comment