- BeautifulSoup
- Requests
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
import sys

import requests
from BeautifulSoup import BeautifulSoup

# Python 2 only: force the process-wide default string encoding to UTF-8 so
# implicit str/unicode conversions of scraped text use UTF-8 instead of ASCII.
# NOTE(review): reload(sys) + setdefaultencoding is a widely discouraged
# hack; it is kept here so existing behaviour does not change.
reload(sys)
sys.setdefaultencoding('utf8')

# The URL we will scrape first.
url = 'https://www.somepageortheother.nl/bladeren'

# Seconds to wait for the server; without a timeout requests can block
# forever on an unresponsive host.
REQUEST_TIMEOUT = 30

# Scrape the first URL and collect all available letters of the alphabet.
print("obtaining a list of all pages to visit")
response = requests.get(url, timeout=REQUEST_TIMEOUT)
# Stop on HTTP errors (4xx/5xx) instead of parsing an error page.
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html)

# find() returns None when no matching element exists; check explicitly so
# the failure is reported clearly rather than as an AttributeError below.
alphabet = soup.find('ul', attrs={'class': 'letter-list'})
if alphabet is None:
    raise RuntimeError(
        'no <ul class="letter-list"> element found at %s' % url
    )

# One entry per <li> in the letter list, holding that item's text.
letters = [letter.text for letter in alphabet.findAll('li')]
python Webscraping