offline
- Brksi
- Ex KGB officer
- Pridružio: 18 Jul 2003
- Poruke: 4204
- Gde živiš: U zlatnom kavezu
|
Imam Python skriptu koja ne radi sasvim ok, a ja ne mogu da joj nađem grešku. Ona treba da generiše selektovane recenzije hotela u CSV fajl i ona to uradi, ali je problem što bez obzira na broj selekcija ona obradi samo pet. Ako neko zna, neka pomogne. Prilažem kod:
# !/usr/bin/python
# -*- coding: utf-8 -*-
# importing libraries
from bs4 import BeautifulSoup
import urllib, csv, os, datetime, urllib.request, re, sys
# Open the output CSV on the user's Desktop and emit the header row.
# The handle is opened in binary mode, so everything written to it must be
# bytes; the column names are joined with commas and ended with a newline.
file = open(os.path.expanduser(r"~/Desktop/TripAdviser Reviews.csv"), "wb")
file.write(
    b",".join(
        (
            b"Organization",
            b"Address",
            b"Reviewer",
            b"Review Title",
            b"Review",
            b"Review Count",
            b"Help Count",
            b"Attraction Count",
            b"Restaurant Count",
            b"Hotel Count",
            b"Rating Date",
            b"Rating",
        )
    )
    + b"\n"
)
# Start pages to scrape: one URL per hotel, each pointing at the first page of
# that hotel's reviews. Add further URLs as extra comma-separated list items.
WebSites = [
    "https://www.tripadvisor.com/Hotel_Review-g294472-d7181993-Reviews-Holiday_Inn_Express_Belgrade_City-Belgrade.html",
]

# Suffix compared against the last seven characters of each "next page" link;
# review rows are written only while the two are equal.
Checker = "REVIEWS"
# Main scraping loop.
#
# For every start URL in WebSites: download the page, collect the per-reviewer
# badge counts, then write one CSV row per review and follow the "next page"
# link until no such link is found. The output file is closed when the loop
# finishes or when any page raises.


def _fetch_soup(url):
    """Download *url* and return it parsed with the 'html.parser' backend."""
    # The response is closed as soon as BeautifulSoup has consumed it.
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response, 'html.parser')


def _badge_count(badge_text, keyword):
    """Return the count printed just before *keyword* in a member badge.

    *badge_text* is the badge's text with its pieces separated by "|||||".
    The value is taken from the last four characters that precede *keyword*
    (after the first "|"), with separator characters and whitespace removed.
    Returns "0" when *keyword* is absent or no separator precedes it.
    """
    if keyword not in badge_text:
        return "0"
    before_keyword = badge_text.split(keyword, 1)[0]
    pieces = before_keyword.split("|", 1)
    if len(pieces) < 2:
        # Nothing separates a count from the start of the text, so there is
        # no count field to read.
        return "0"
    return pieces[1][-4:].replace("|", "").strip()


try:
    for theurl in WebSites:
        soup = _fetch_soup(theurl)
        while True:
            # One entry per reviewer badge on the page. Every badge appends
            # exactly one value to each list, so the four lists stay aligned.
            helpcountarray = []
            attractionarray = []
            restaurantarray = []
            hotelarray = []
            for profile in soup.find_all(attrs={"class": "memberBadgingNoText"}):
                image = '\n'.join(profile.find_all(string=True))
                image = image.replace("\n", "|||||").strip()
                helpcountarray.append(_badge_count(image, "helpful"))
                attractionarray.append(_badge_count(image, "attraction"))
                restaurantarray.append(_badge_count(image, "REVIEWS_RESTAURANTS"))
                hotelarray.append(_badge_count(image, "REVIEWS_HOTELS"))

            Organization = soup.find(attrs={'class': 'heading_title'}).text
            Organization = Organization.replace('"', ' ').replace('Review of', ' ').strip()
            Address = soup.find_all(attrs={'class': 'address_search'})[0].text
            Address = Address.replace(',', '').replace('\n', '').strip()

            # Look the review elements up once per page instead of once per row.
            reviewers = soup.find_all(attrs={"class": "username mo"})
            titles = soup.find_all(attrs={"class": "quote"})
            entries = soup.find_all(attrs={"class": "entry"})
            dates = soup.find_all(attrs={"class": "ratingDate"})

            # Write one row per badge; x indexes the matching review elements.
            counts = zip(helpcountarray, attractionarray, restaurantarray, hotelarray)
            for x, (HelpCount, AttractionCount, Restaurant, Hotel) in enumerate(counts):
                try:
                    Reviewer = reviewers[x].text
                except IndexError:
                    # Fewer user names than badges on this page: skip the row.
                    continue
                # Commas and quotes are removed because rows are joined by
                # hand rather than written through a CSV quoting layer.
                Reviewer = (Reviewer.replace(',', ' ').replace('”', '')
                            .replace('“', '').replace('"', '').strip())
                ReviewTitle = (titles[x].text.replace(',', ' ').replace('”', '')
                               .replace('“', '').replace('"', '')
                               .replace('é', 'e').strip())
                Review = entries[x].text.replace(',', ' ').replace('\n', ' ').strip()
                RatingDate = (dates[x].text.replace('Reviewed', ' ')
                              .replace('NEW', ' ').replace(',', ' ').strip())
                # The "Review Count" and "Rating" columns are left empty:
                # this script does not extract either value.
                Record = ",".join([
                    Organization, Address, Reviewer, ReviewTitle, Review, "",
                    HelpCount, AttractionCount, Restaurant, Hotel, RatingDate, "",
                ])
                # NOTE(review): rows are written only while the most recent
                # "next page" link ended in "REVIEWS"; if later links end
                # differently, every page after the first is dropped. Confirm
                # this gate is still wanted.
                if Checker == "REVIEWS":
                    # Characters outside ASCII are dropped from the output.
                    file.write(Record.encode("ascii", errors="ignore") + b"\n")

            # NOTE(review): a multi-word class string matches only elements
            # whose class attribute is exactly this string. If the site uses
            # additional classes on the link, nothing is found and the loop
            # stops after the first page - verify against the live markup.
            link = soup.find_all(attrs={"class": "nav next taLnk"})
            print(Organization)
            if not link:
                break
            next_href = link[0].get('href')
            soup = _fetch_soup('http://www.tripadvisor.com' + next_href)
            print(next_href)
            Checker = next_href[-7:]
finally:
    # Close the CSV even when a page fails, so rows already written are kept.
    file.close()
|