Challenges: Scrape Top Indices Data from Yahoo Finance

Utilize Selenium to scrape the history of important market indices from Yahoo.

We'll cover the following

Problem statement

The task is to write code that retrieves the historical data (first 100 rows) for each of the top three major indices listed on Yahoo Finance's World Indices page.

Press + to interact
World indices
World indices
Press + to interact
Sample history table for S&P index
Sample history table for S&P index

Expected output

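A list of JSON objects (Python dictionaries), one per table row, each with:

  • The keys date, open, and close, whose values are extracted from the Date, Open, and Close columns of the table shown on the screen above.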

Points to remember:

  • Just scrape the first 100 rows for each index and no more.

  • Write your code in main.py under the #todo section.

  • After running the code, it will test the output against the expected output using hidden test cases.

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.support.wait import WebDriverWait

def _wait_for_rows(driver, selector, min_rows, timeout):
    """Scroll until at least *min_rows* elements match *selector*, then return them.

    Gives up after *timeout* seconds and returns whatever matches at that
    point, so a table that never grows cannot hang the scraper.
    """
    def _loaded(drv):
        if len(drv.find_elements(By.CSS_SELECTOR, selector)) >= min_rows:
            return True
        # Scrolling to the bottom is what the original loop did to make the
        # page render more rows.
        drv.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        return False

    try:
        WebDriverWait(driver, timeout).until(_loaded)
    except TimeoutException:
        # Deliberate best effort: fall through and use the rows that did load.
        pass
    return driver.find_elements(By.CSS_SELECTOR, selector)


def _scrape(max_rows=100, timeout=20):
    """Scrape history rows for the first three indices on Yahoo's world-indices page.

    Args:
        max_rows: Maximum number of history rows to collect per index.
        timeout: Seconds to wait for the index list, and for each history
            table to reach *max_rows* rows, before continuing with what loaded.

    Returns:
        A flat list of dicts with the keys "date", "open" and "close", holding
        the text of the 1st, 2nd and 5th cell of each history-table row, for
        the three indices in page order.
    """
    link_selector = "tr > td:nth-child(1) > span > div > a"
    row_selector = "table > tbody > tr"

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    data = []
    try:
        driver.get("https://finance.yahoo.com/world-indices/")
        try:
            # Wait for the index links instead of assuming they are already
            # in the DOM when the page load event fires.
            top_3 = WebDriverWait(driver, timeout).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, link_selector))
            )
        except TimeoutException:
            top_3 = []
        hrefs = [x.get_attribute("href") for x in top_3[:3]]
        top_3_links = [href for href in hrefs if href]

        for link in top_3_links:
            # Strip a trailing slash so the URL never contains "//history".
            driver.get(link.rstrip("/") + "/history")
            rows = _wait_for_rows(driver, row_selector, max_rows, timeout)
            for row in rows[:max_rows]:
                data.append({
                    "date": row.find_element(By.CSS_SELECTOR, 'td:nth-child(1)').text,
                    "open": row.find_element(By.CSS_SELECTOR, 'td:nth-child(2)').text,
                    "close": row.find_element(By.CSS_SELECTOR, 'td:nth-child(5)').text,
                })
    finally:
        # quit() ends the whole WebDriver session (close() only closes the
        # window), and running it in finally releases it on errors too.
        driver.quit()
    return data

def test(expected_output, output):
    """Compare scraped records with the expected ones, field by field.

    For "date", then "open", then "close", the set of values found in
    *expected_output* is compared with the set found in *output*. A success
    line is printed for each matching field; on the first mismatch a failure
    line showing both sets is printed and the function returns immediately.
    When all three fields match, a final summary line is printed.

    Both arguments are iterables of mappings that provide those three keys.
    Because sets are compared, ordering and duplicate values are ignored.
    """
    # (test case number, record key, plural label used in the failure message)
    checks = (
        (1, "date", "dates"),
        (2, "open", "opens"),
        (3, "close", "closes"),
    )

    for number, key, label in checks:
        wanted = {record[key] for record in expected_output}
        found = {record[key] for record in output}
        if wanted != found:
            print(f"Test Case {number} (scraped {label}) failed. Expected: {wanted}, Got: {found}")
            return
        print(f"Test case {number} succeeded")

    print("All test cases passed!")














Scraping market indices historical data using Selenium

Get hands-on with 1300+ tech skills courses.