Skip to content Skip to sidebar Skip to footer

How To Scrape Review Data Present In Read More In Flipkart Reviews

I am trying to scrape Flipkart to extract reviews for a product using the requests and BeautifulSoup packages. How can I extract the data hidden behind the "Read more" click event in those reviews?

Solution 1:

from selenium import webdriver
from selenium.webdriver.common.by import By
from contextlib import closing
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import urllib2  # NOTE(review): Python 2-only module (and unused below); on Python 3 this import fails — use urllib.request if needed
import re
from bs4 import BeautifulSoup
import unicodedata

def remove_non_ascii_1(text):
    """Return *text* with every non-ASCII character replaced by a space.

    Keeps the string length stable (one space per replaced character) so
    surrounding slicing logic is unaffected.
    """
    return ''.join(ch if ord(ch) < 128 else ' ' for ch in text)

with closing(Firefox()) as browser:
    site = "https://www.flipkart.com/asus-zenfone-2-laser-ze550kl-black-16-gb/product-reviews/itme9j58yzyzqzgc?pid=MOBE9J587QGMXBB7"
    browser.get(site)

    # Context manager closes the file even if scraping raises; also avoids
    # shadowing the builtin name `file`.
    with open("review.txt", "w") as out:
        for count in range(1, 10):
            # Page-number buttons of the review pagination bar.
            nav_btns = browser.find_elements_by_class_name('_33m_Yg')

            # Find the button whose label matches the page we want next.
            button = None
            for btn in nav_btns:
                if int(btn.text) == count:
                    button = btn
                    break

            button.send_keys(Keys.RETURN)
            # Wait until the review titles of the new page are in the DOM.
            WebDriverWait(browser, timeout=10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))

            # Expand every truncated review so the full text is present.
            read_more_btns = browser.find_elements_by_class_name('_1EPkIx')
            for rm in read_more_btns:
                browser.execute_script("return arguments[0].scrollIntoView();", rm)
                # Nudge upward so the sticky header does not cover the button.
                browser.execute_script("window.scrollBy(0, -150);")
                rm.click()

            soup = BeautifulSoup(browser.page_source, "lxml")

            for tag in soup.find_all("div", class_="_3DCdKt"):
                # str() replaces the Python 2-only unicode(); curly quotes are
                # normalised before stripping the remaining non-ASCII chars.
                title = str(tag.find("p", class_="_2xg6Ul").string).replace(u"\u2018", "'").replace(u"\u2019", "'")
                title = remove_non_ascii_1(title)
                content = tag.find("div", class_="qwjRop").div.prettify().replace(u"\u2018", "'").replace(u"\u2019", "'")
                content = remove_non_ascii_1(content)
                # Strip prettify()'s enclosing "<div>\n ... \n</div>" wrapper.
                content = content[15:-7]

                # First counter is upvotes, second is downvotes.
                votes = tag.find_all("span", class_="_1_BQL8")
                upvotes = int(votes[0].string)
                downvotes = int(votes[1].string)

                out.write("Review Title : %s\n\n" % title)
                out.write("Upvotes : " + str(upvotes) + "\n\nDownvotes : " + str(downvotes) + "\n\n")
                out.write("Review Content :\n%s\n\n\n\n" % content)

Usage:

  1. Install the requirements by running pip install bs4 selenium.
  2. Add geckodriver to the PATH (see the official geckodriver/Selenium installation documentation).
  3. Put the link of the product in site variable inside the script.
  4. Run the script by running python scrape.py.
  5. Reviews will be saved in the file review.txt.

Solution 2:

Had some issues using @CSMaverick code while accessing the READ MORE link. Modified the code as per my requirement.

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup as bs

def get_source_code(browser):
    """Click every 'READ MORE' button on the current page, then return the
    fully expanded page source.

    :param browser: a Selenium WebDriver instance with the review page loaded
    :return: the page's HTML source as a string
    """
    rm_btns = browser.find_elements_by_class_name('_1BWGvX')
    for rm_btn in rm_btns:
        rm_btn.click()
    return browser.page_source


def collect_reviews_attributes(html):
    """Parse one review page and return a list of (heading, text, rating) tuples.

    :param html: page source of a Flipkart review page (string)
    :return: list of 3-tuples, one per review found on the page
    """
    soup_obj = bs(html, "html.parser")
    # NOTE: the original passed a *set* {"class", "t-ZTKy"} as attrs, which
    # does not filter on the class value at all; attrs must map the attribute
    # name to the value being matched.
    text_tag_divs = soup_obj.find_all('div', attrs={"class": "t-ZTKy"})
    heading_tag_divs = soup_obj.find_all('p', attrs={"class": "_2-N8zT"})
    rating_tag_divs = soup_obj.find_all('div', attrs={"class": "_3LWZlK _1BLPMq"})
    text_tags = [tag.text for tag in text_tag_divs]
    heading_tags = [tag.text for tag in heading_tag_divs]
    rating_tags = [tag.text for tag in rating_tag_divs]
    return list(zip(heading_tags, text_tags, rating_tags))
    
collector_list = []

# Path to the geckodriver executable; adjust for your machine.
browser = webdriver.Firefox(executable_path=r"path to\geckodriver.exe")
url = "https://www.flipkart.com/samsung-253-l-frost-free-double-door-3-star-convertible-refrigerator/product-reviews/itmf75fa1554bad3?pid=RFRFNDEEJ28SNQPG&lid=LSTRFRFNDEEJ28SNQPGEJ3YHJ&sortOrder=MOST_HELPFUL&certifiedBuyer=false&aid=overall"
num_pages = 3  # get from the url dynamically, or give a large number and try hitting until you get an exception

browser.get(url)  # open the url in the browser
for _ in range(num_pages):
    page_source_code = get_source_code(browser)
    collector_list.extend(collect_reviews_attributes(page_source_code))
    # "Previous" and "Next" share the same class; the last element is "Next".
    next_page = browser.find_elements_by_class_name('_1LKTO3')[-1]
    next_page.click()

Post a Comment for "How To Scrape Review Data Present In Read More In Flipkart Reviews"