Challenge: Scrape PayPal FAQs Using Selenium and XPath

Scrape FAQs of PayPal using Selenium and XPath.

We'll cover the following

Problem statement

Write code below to get all the FAQs (just the questions) from the PayPal Help Center.

Press + to interact
PayPal Help Center
PayPal Help Center

Expected output

A list of JSON objects, each with:

  • question: The posted question

  • url: The URL of the question

Points to remember:

  • Just scrape the questions and no more.

  • Don't scrape the Home section as it has repeated questions from the other sections.

  • Please include all the questions in all the side topics.

  • Write the code in main.py under the #TODO section.

  • After running the code, it will test the output against the expected output using hidden test cases.

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.support.wait import WebDriverWait

def _scrape():
    """Scrape all FAQ questions from the PayPal Help Center side topics.

    Returns:
        list[dict]: one dict per question, each with keys:
            - "question": the question text (element innerHTML)
            - "url": the absolute URL of the question's article page

    Raises:
        TimeoutException: if the topic links or question links fail to
            load within 10 seconds.
    """
    # Local imports: these names were used but never imported at the top
    # of the file (NameError in the original).
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager

    # NOTE(review): `options` was previously undefined; a default
    # ChromeOptions is used here — confirm whether headless or other
    # flags are required by the grading environment.
    options = webdriver.ChromeOptions()

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options)
    data = []
    try:
        driver.get("https://www.paypal.com/us/cshelp/personal")

        # Sidebar topic links. The XPath filters on 'topic' in the href,
        # which excludes the Home section (repeated questions).
        try:
            topics = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, "//ul/li/a[contains(@href, 'topic')]")))
        except TimeoutException:
            raise TimeoutException("Elements are not loaded")

        # Collect the hrefs once and navigate directly: after the first
        # navigation the original WebElements go stale, so the original
        # code had to re-find them on every iteration. Navigating by URL
        # avoids both the staleness and the unused-`urls` dead code.
        topic_urls = [t.get_attribute("href") for t in topics]

        for topic_url in topic_urls:
            driver.get(topic_url)
            # Question links on a topic page all point at 'article' URLs.
            try:
                questions = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located(
                        (By.XPATH, "//a[contains(@href, 'article')]")))
            except TimeoutException:
                raise TimeoutException("Elements are not loaded")
            data.extend(
                {"question": q.get_attribute("innerHTML"),
                 "url": q.get_attribute("href")}
                for q in questions)
    finally:
        # quit() (not close()) ends the whole driver session, and the
        # finally block guarantees the browser is not leaked when a
        # TimeoutException is raised mid-scrape.
        driver.quit()
    return data

def test(expected_output, output):
    # Test Case 1 
    expected_questions = set([x['question'] for x in expected_output[:5]])
    output_questions = set([x['question'] for x in output[:5]])
    try:
        assert expected_questions == output_questions
    except:
        print(f"Test Case 1 (scraped Questions) failed. Expected: {expected_questions}, Got: {output_questions}\n")
        return
    # Test Case 2
    expected_urls = set([x['url'] for x in expected_output[:5]])
    output_urls = set([x['url'] for x in output[:5]])
    try: 
        assert expected_urls == output_urls
    except:
        print(f"Test Case 2 (scraped Urls) failed. Expected: {expected_urls}, Got: {output_urls}\n")
        return
    print("All test cases passed!")














Scraping PayPal FAQs using Selenium and XPaths

Get hands-on with 1300+ tech skills courses.