Python skripta

Python skripta

offline
  • Pridružio: 18 Jul 2003
  • Poruke: 4204
  • Gde živiš: U zlatnom kavezu

Imam Python skriptu koja ne radi sasvim ok, a ja ne mogu da joj nadjem gresku. Ona treba da generise selektovane recenzije hotela u csv fajl i ona to uradi, ali je problem sto bez obzira na broj selekcija ona obradi samo pet. Ako neko zna, neka pomogne. Prilazem kod:

  1. # !/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. # importing libraries
  4. from bs4 import BeautifulSoup
  5. import urllib, csv, os, datetime, urllib.request, re, sys
  6.  
  7. # creating CSV file to be used
  8.  
  9. file = open(os.path.expanduser(r"~/Desktop/TripAdviser Reviews.csv"), "wb")
  10. file.write(
  11.     b"Organization" + b"," + b"Address" + b"," + b"Reviewer" + b"," + b"Review Title" + b"," + b"Review" + b"," + b"Review Count" + b"," + b"Help Count"
  12.      + b"," + b"Attraction Count" + b"," + b"Restaurant Count" + b"," + b"Hotel Count" + b"," +  b"Rating Date" + b"," + b"Rating" + b"\n")
  13.  
  14. # List the first page of the reviews (ends with "#REVIEWS") - separate the websites with ,
  15. WebSites = [
  16.     "https://www.tripadvisor.com/Hotel_Review-g294472-d7181993-Reviews-Holiday_Inn_Express_Belgrade_City-Belgrade.html"
  17. ]
  18. Checker = "REVIEWS"
  19. # looping through each site until it hits a break
  20. for theurl in WebSites:
  21.     thepage = urllib.request.urlopen(theurl)
  22.     soup = BeautifulSoup(thepage, 'html.parser')
  23.     while True:
  24.         # extract the help count, restaurant review count, attraction review count and hotel review count
  25.         a = b = 0
  26.         helpcountarray = restaurantarray = attractionarray = hotelarray = ""
  27.  
  28.         for profile in soup.findAll(attrs={"class": "memberBadgingNoText"}):
  29.             textFromHt = profile.findAll(text=True)
  30.             image = '\n'.join(textFromHt)
  31.             image = image.replace("\n", "|||||").strip()
  32.             # print "".join(profile.findAll(text=True))
  33.  
  34.             if image.find("helpful") > 0:
  35.                 counter = image.split("helpful", 1)[0].split("|", 1)[1][-4:].replace("|", "").strip()
  36.                 if len(helpcountarray) == 0:
  37.                     helpcountarray = [counter]
  38.                 else:
  39.                     helpcountarray.append(counter)
  40.             elif image.find("helpful") < 0:
  41.                 if len(helpcountarray) == 0:
  42.                     helpcountarray = ["0"]
  43.                 else:
  44.                     helpcountarray.append("0")
  45.  
  46.             if image.find("attraction") > 0:
  47.                 counter = image.split("attraction", 1)[0].split("|", 1)[1][-4:].replace("|", "").strip()
  48.                 if len(attractionarray) == 0:
  49.                     attractionarray = [counter]
  50.                 else:
  51.                     attractionarray.append(counter)
  52.             elif image.find("attraction") < 0:
  53.                 if len(attractionarray) == 0:
  54.                     attractionarray = ["0"]
  55.                 else:
  56.                     attractionarray.append("0")
  57.  
  58.             if image.find("REVIEWS_RESTAURANTS") > 0:
  59.                 counter = image.split("REVIEWS_RESTAURANTS", 1)[0].split("|", 1)[1][-4:].replace("|", "").strip()
  60.                 if len(restaurantarray) == 0:
  61.                     restaurantarray = [0]
  62.                 else:
  63.                     restaurantarray.append("0")
  64.             elif image.find("REVIEWS_RESTAURANTS") < 0:
  65.                 if len(restaurantarray) == 0:
  66.                     restaurantarray = ["0"]
  67.                 else:
  68.                     restaurantarray.append("0")
  69.  
  70.             if image.find("REVIEWS_HOTELS") > 0:
  71.                 counter = image.split("REVIEWS_HOTELS", 1)[0].split("|", 1)[1][-4:].replace("|", "").strip()
  72.                 if len(hotelarray) == 0:
  73.                     hotelarray = [counter]
  74.                 else:
  75.                     hotelarray.append(counter)
  76.             elif image.find("REVIEWS_HOTELS") < 0:
  77.                 if len(hotelarray) == 0:
  78.                     hotelarray = ["0"]
  79.                 else:
  80.                     hotelarray.append("0")
  81.  
  82.                     # extract the rating count for each user review
  83.         #altarray = ""
  84.         #for rating in soup.findAll(attrs={"class": "rating reviewItemInline"}):
  85.          #   alt = rating.find('img', alt=True)
  86.          #   if alt[-5:] == '':
  87.           #      if len(altarray) == 0:
  88.           #          altarray = [alt]
  89.        # else:
  90.             # noinspection PyUnboundLocalVariable
  91.         #    altarray.append(alt)
  92.  
  93.         Organization = soup.find(attrs={'class': 'heading_title'
  94.                                         }).text.replace('"', ' ').replace('Review of', ' ').strip()
  95.         Address = soup.findAll(attrs={'class': 'address_search'
  96.                                       })[0].text.replace(',', '').replace('\n', ''
  97.                                                                           ).strip()
  98.  
  99.         # Loop through each review on the page
  100.         for x in range(0, len(hotelarray)):
  101.             # noinspection PyBroadException
  102.             try:
  103.                 Reviewer = soup.findAll(attrs={"class": "username mo"})[x].text
  104.             except:
  105.                 Reviewer = "N/A"
  106.                 continue
  107.  
  108.             Reviewer = Reviewer.replace(',', ' ').replace('”', '').replace('“', '').replace('"', '').strip()
  109.             ReviewTitle = soup.findAll(attrs={"class": "quote"})[x].text.replace(',', ' ').replace('”', '').replace('“',
  110.                                                                                                                     '').replace(
  111.                 '"', '').replace('é', 'e').strip()
  112.             Review = soup.findAll(attrs={"class": "entry"})[x].text.replace(',', ' ').replace('\n', ' ').strip()
  113.             RatingDate = soup.findAll(attrs={"class": "ratingDate"})[x].text.replace('Reviewed', ' ').replace('NEW',
  114.                                                                                                               ' ').replace(
  115.                 ',', ' ').strip()
  116.             # Rating = altarray[x][:1]
  117.             HelpCount = helpcountarray[x]
  118.             AttractionCount = attractionarray[x]
  119.             Restaurant = restaurantarray[x]
  120.             Hotel = hotelarray[x]
  121.  
  122.             Record = Organization + "," + Address + "," + Reviewer + "," + ReviewTitle + "," + Review + "," + "," + HelpCount + "," + AttractionCount + "," + Restaurant + "," + Hotel + ","  + RatingDate + ","
  123.             if Checker == "REVIEWS":
  124.                 file.write(bytes(Record, encoding="ascii", errors='ignore') + b"\n")
  125.  
  126.         link = soup.find_all(attrs={"class": "nav next taLnk"})
  127.         print(Organization)
  128.         if len(link) == 0:
  129.             break
  130.         else:
  131.             soup = BeautifulSoup(urllib.request.urlopen('http://www.tripadvisor.com'
  132.                                                         + link[0].get('href')), 'html.parser')
  133.             print(link[0].get('href'))
  134.             Checker = link[0].get('href')[-7:]
  135.  
  136. file.close()



Registruj se da bi učestvovao u diskusiji. Registrovanim korisnicima se NE prikazuju reklame unutar poruka.
Ko je trenutno na forumu
 

Ukupno su 1185 korisnika na forumu :: 120 registrovanih, 11 sakrivenih i 1054 gosta   ::   [ Administrator ] [ Supermoderator ] [ Moderator ] :: Detaljnije

Najviše korisnika na forumu ikad bilo je 3466 - dana 01 Jun 2021 17:07

Korisnici koji su trenutno na forumu:
Korisnici trenutno na forumu: 04bokibole, acov34, Adaminho1985, advokat84, airsuba, aleksandarbl, Apok, armor, Asparagus, Avalon015, babaroga, Belac91, bestguarder, bigfoot, Bivan, Bobrock1, bojank, Bole72, Boris90, Borski1977, BORUTUS, BOXRR, BradaRS, BWG, CikaKURE, Coabelgrade, Coficab, Crazzer, d.arsenal321, damirZR, debeli, Denaya, dexsilni, dexteroza, drgrozozo, dzada, EXIT78, Feller, gomago, GORDI, Gorilo_1991, Haris, HrcAk47, ikan, janezek67, kaisarevic1, kalens021, kendzo-andzo-boni-fju, kenny74, komsija1, kybonacci, Levi, lima, ljiljak, Logic005, lucko1, M-HOBBY, Macalone, Marko Marković, Maruti, max power, mercedesamg, Mercury, Metanoja, Miki01, Miki281, mikrimaus, milanpb, milenko crazy north, Milo97, milos.cbr, MiroslavD, Mskok, Nemanja Opalić, Nemanja.M, Niske, opt1, Pale2025, pedjolino76, pein, Polemarchoi, Povratak1912, PrincipL, proka89, Prometeus, RajkoB, redstar011, RJ, Romibrat, Romuluss, rovac, royst33, ruma, Rusmir, sabros, sale76, sap, Savantije, savuni, sekretar, sickmouse, StepskiVuk, strn, tecataki, Tribal, Trivo, vathra, vidra1, virked, Vlada1389, Vlado82, voja64, Vojin, vuksa72, yrraf, zauzet, zlaya011, Zorge, Žrnov, šumar bk2