Python : IndexError: List Index Out Of Range After Modifying Code
Solution 1:
The only date I see is at the top of the page with the caption "Next Soccer Matches:".
I don't see any point in first creating a dataframe with df = pd.read_html(browser.page_source)[0]
and then iterating over that dataframe; you should iterate directly over the tags of the main table. Done correctly, you will end up with the correct values in the country and league columns.
I have also renamed a few variables to reflect more closely the type of value they hold. I have also simplified your navigation through the HTML hierarchy a bit: an element with an id attribute must be unique within the document, so you can retrieve it directly by that id without first retrieving its parent.
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
import re
class Driver:
    """Own one headless Chrome WebDriver and quit it when cleaned up.

    Attributes:
        driver: the ``webdriver.Chrome`` instance, or ``None`` when the
            browser was never started or has already been quit.
    """

    def __init__(self):
        # Bind the attribute first so __del__ stays safe even if starting
        # Chrome raises part-way through construction.
        self.driver = None
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Suppress logging (comment the next line out to see the log output):
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        # Clean up the browser when this wrapper is cleaned up.  Tolerate a
        # missing attribute and clear it before quitting so that a repeated
        # finalisation never calls quit() twice.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            self.driver = None
            driver.quit()
# Thread-local storage: attributes set on it are private to the setting thread.
threadLocal = threading.local()


def create_driver():
    """Return the calling thread's WebDriver, creating it on first use.

    The ``Driver`` wrapper is cached on ``threadLocal`` under the attribute
    ``the_driver``; later calls from the same thread reuse it.
    """
    try:
        wrapper = threadLocal.the_driver
    except AttributeError:
        wrapper = None
    if wrapper is None:
        wrapper = Driver()
        threadLocal.the_driver = wrapper
    return wrapper.driver
class GameData:
    """Column-oriented container for scraped matches.

    Every attribute is a list; the i-th entries of all lists together
    describe one match.  The attributes are created in a fixed order so the
    instance ``__dict__`` lists the columns in that same order.
    """

    def __init__(self):
        columns = (
            'date',
            'time',
            'game',
            'score',
            'home_odds',
            'draw_odds',
            'away_odds',
            'country',
            'league',
        )
        for column in columns:
            # A fresh list per column: the columns must not share storage.
            setattr(self, column, [])
def generate_matches(table):
    """Yield one tuple per match row found in *table*.

    Rows whose ``class`` contains ``dark`` are treated as section headers:
    the first two links inside their ``th.first2.tl`` cell give the country
    and league that apply to the match rows that follow.  Rows whose
    ``class`` contains ``deactivate`` are match rows.

    Args:
        table: a parsed tag exposing ``find_all`` (e.g. a BeautifulSoup
            ``Tag``) that contains the ``tr`` rows.

    Yields:
        8-tuples: the text of the first six ``td`` cells of the match row,
        followed by ``country`` and ``league``.  ``country`` / ``league``
        are ``None`` when no usable header has been seen for the row.
    """
    # Bound up front so a match row that precedes any header row cannot
    # raise UnboundLocalError.
    country = league = None
    for tr_tag in table.find_all('tr'):
        tr_class = tr_tag.attrs.get('class')
        if not tr_class:
            continue
        if 'dark' in tr_class:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.find_all('a') if th_tag is not None else []
            names = [a_tag.text for a_tag in a_tags]
            # Reset rather than keep the previous section's values when the
            # header is incomplete, so rows are never mislabelled.
            country = names[0] if len(names) > 0 else None
            league = names[1] if len(names) > 1 else None
        elif 'deactivate' in tr_class:
            td_tags = tr_tag.find_all('td')
            if len(td_tags) < 6:
                # Not a complete match row; skip it instead of raising
                # IndexError.
                continue
            yield (*(td_tag.text for td_tag in td_tags[:6]), country, league)
def parse_data(url):
    """Load *url* in this thread's browser and collect its match rows.

    Args:
        url: address of the page to scrape.

    Returns:
        A ``GameData`` whose lists hold one entry per match row.  It is
        empty when the page has no ``table.table-main`` inside
        ``div#col-content`` (a warning is issued in that case).  The
        ``date`` entries are ``None`` when the page's first ``h1`` does not
        end with a date such as ``03 Sep 2021``.
    """
    import warnings

    browser = create_driver()
    browser.get(url)
    soup = bs(browser.page_source, "lxml")
    game_data = GameData()

    # find() returns None when nothing matches, so check each step instead
    # of failing with an opaque AttributeError.
    div = soup.find('div', {'id': 'col-content'})
    table = None
    if div is not None:
        table = div.find('table', {'class': 'table-main'})
    if table is None:
        warnings.warn(f"no matches table found at {url}")
        return game_data

    # The date is taken from the end of the first heading's text.
    h1_tag = soup.find('h1')
    heading = h1_tag.text.strip() if h1_tag is not None else ''
    m = re.search(r'\d+ \w+ \d{4}$', heading)
    game_date = m[0] if m is not None else None

    for (kickoff, game, score, home_odds, draw_odds, away_odds,
         country, league) in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(kickoff)
        game_data.game.append(game)
        game_data.score.append(score)
        game_data.home_odds.append(home_odds)
        game_data.draw_odds.append(draw_odds)
        game_data.away_odds.append(away_odds)
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data
# URLs go here.  A list (rather than a set) keeps the scrape order -- and so
# the row order of the combined results -- the same on every run.
urls = [
    "https://www.oddsportal.com/matches/soccer/20210903/",
]
if __name__ == '__main__':
    import gc

    # To limit the number of browsers we will use
    # (set to a large number if you don't want a limit):
    MAX_BROWSERS = 5

    # ThreadPool rejects a size of 0, so keep at least one worker even when
    # no URLs are configured.
    pool = ThreadPool(max(1, min(MAX_BROWSERS, len(urls))))
    try:
        # imap yields the results in the same order as urls.
        frames = [pd.DataFrame(game_data.__dict__)
                  for game_data in pool.imap(parse_data, urls)]
    finally:
        # Let the worker threads finish and exit before cleaning up drivers.
        pool.close()
        pool.join()

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    results = pd.concat(frames, ignore_index=True) if frames else None
    print(results)

    # ensure all the drivers are "quitted":
    del threadLocal
    gc.collect()  # a little extra insurance
Prints:
date time game score home_odds draw_odds away_odds country league
0 03 Sep 2021 00:00 Petrolera - Dep. Pasto 2:3 -128 +245 +334 Colombia Copa Colombia
1 03 Sep 2021 00:00 Jalapa - Export Sebaco 0:2 -137 +266 +307 Nicaragua Liga Primera
2 03 Sep 2021 00:00 Venezuela - Argentina 1:3 +799 +376 -270 World World Cup 2022
3 03 Sep 2021 00:05 Canada - Honduras 1:1 -196 +290 +597 World World Cup 2022
4 03 Sep 2021 01:00 Peru - Uruguay 1:1 +231 +204 +140 World World Cup 2022
.. ... ... ... ... ... ... ... ... ...
219 03 Sep 2021 23:00 Greenville - Toronto FC II 3:0 -147 +263 +363 USA USL League One
220 03 Sep 2021 23:30 Nashville SC - New York City 3:1 +166 +235 +166 USA MLS
221 03 Sep 2021 23:30 Philadelphia Union - New England Revolution 0:1 +164 +256 +154 USA MLS
222 03 Sep 2021 23:30 Louisville City - FC Tulsa 0:1 -233 +394 +459 USA USL Championship
223 03 Sep 2021 23:30 Tampa Bay - Oakland Roots 3:0 -227 +320 +573 USA USL Championship
[224 rows x 9 columns]
Post a Comment for "Python : IndexError: List Index Out Of Range After Modifying Code"