I need a lot of columns to analyze the data. I use the cianparser library. The task is to parse the declarations for sale. But, for some reason, exactly what I need ( located in the processed_data list ) absolutely does not want to be parsed! I tried to withdraw the keys: I look, but they are not there, although with_extra_data = True, in theory, should collect what is after if with_extra_data:, but not 🙁
keys:
{'author': 'Абсолют Недвижимость', 'author_type': 'developer', 'url': 'https://www.cian.ru/sale/flat/307207783/', 'location': 'Москва', 'deal_type': 'sale', 'accommodation_type': 'flat', 'floor': 14, 'floors_count': 19, 'rooms_count': 1, 'total_meters': 37.6, 'price_per_month': -1, 'commissions': 0, 'price': 9743081, 'district': '', 'street': '', 'house_number': '', 'underground': '', 'residential_complex': 'Город-парк Первый Московский'}
the code itself (that’s not all, it will be there below):
import csv
import cianparser
import os
import time
import random
import requests
import logging
# Setting up logging
logging.basicConfig(filename="parsing_errors.log", level=logging.ERROR,
format="%(asctime)s:%(levelname)s:%(message)s")
def save_data_to_csv(data, file_name, mode="a"):
if not data:
return
fieldnames = data[0].keys()
file_exists = os.path.isfile(file_name)
with open(file_name, mode, newline="", encoding='utf-8') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
if not file_exists or mode == 'w':
writer.writeheader()
writer.writerows(data)
def get_random_proxy():
proxy_list = [
'117.250.3.58:8080',
'115.96.208.124:8080',
'152.67.0.109:80',
'45.87.68.2:15321',
'68.178.170.59:80',
'20.235.104.105:3729',
'195.201.34.206:80'
]
proxy = random.choice(proxy_list)
return {"http": f"http://{proxy}", "https": f"http://{proxy}"}
csv_name="moscow_4.csv"
def collect_real_estate_data(locations, deal_type="sale", rooms=(1), start_page=1, end_page=2,
file_name=csv_name, with_extra_data=True):
all_data = []
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) Gecko/20100101 Firefox/85.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
]
for location in locations:
parser = cianparser.CianParser(location=location)
print(f"Data collection for {location}...")
for page in range(start_page, end_page + 1):
user_agent = random.choice(user_agents)
proxy = get_random_proxy()
session = requests.Session()
session.proxies.update(proxy)
session.headers.update({"User-Agent": user_agent, "Referer": "https://cian.ru"})
retries = 3 # Number of attempts
for attempt in range(retries):
try:
data = parser.get_flats(deal_type=deal_type, rooms=rooms,
additional_settings={"start_page": page, "end_page": page})
break # If the data is successfully received, exit the loop
except requests.exceptions.RequestException as e:
print(f"Page request error {page} for {location}: {e}. Attempt {attempt + 1}/{retries}")
time.sleep(5 + random.uniform(0, 5)) # Pause before repeating
if attempt == retries - 1:
logging.error(f"Could not get data from {page}-й page for {location}: {e}")
continue
processed_data = []
for flat in data:
processed_flat = {
"author": flat.get("author", "Not specified"),
"author_type": flat.get("author_type", "Not specified"),
"url": flat.get("url", "Not specified"),
"price": flat.get("price", "Not specified"),
"location": flat.get("location", "Not specified"),
"district": flat.get("district", "Not specified"),
"street": flat.get("street", "Not specified"),
"underground": flat.get("underground", "Not specified"),
"total_meters": flat.get("total_meters", "Not specified"),
"rooms_count": flat.get("rooms_count", "Not specified"),
"floor": flat.get("floor", "Not specified"),
"floors_count": flat.get("floors_count", "Not specified"),
"residential_complex": flat.get("residential_complex", "Not specified"),
}
if with_extra_data:
processed_flat["object_type"] = flat.get("object_type", "Not specified") # Type of housing (secondary/new building)
processed_flat["class"] = flat.get("class", "Not specified")
processed_flat["finish_type"] = flat.get("finish_type", "Not specified")
processed_flat["heating_type"] = flat.get("heating_type", "Not specified")
processed_flat["house_material_type"] = flat.get("house_material_type", "Not specified")
processed_flat["metro_foot_minute"] = flat.get("metro_foot_minute", "Not specified")
processed_flat["living_meters"] = flat.get("living_meters", "Not specified")
processed_flat["kitchen_meters"] = flat.get("kitchen_meters", "Not specified")
processed_flat["ceiling_height"] = flat.get("ceiling_height", "Not specified")
processed_flat["year_construction"] = flat.get("year_construction", "Not specified")
processed_flat["min_house_year"] = flat.get("min_house_year", "Not specified")
processed_flat["parking_type"] = flat.get("parking_type", "Not specified")
processed_flat["builder"] = flat.get("builder", "Not specified")
processed_flat["have_loggia"] = flat.get("have_loggia", "Not specified")
processed_flat["min_balconies"] = flat.get("min_balconies", "Not specified")
processed_flat["max_house_year"] = flat.get("max_house_year", "Not specified")
print(flat)
processed_data.append(processed_flat)
all_data.extend(processed_data)
if len(all_data) >= 100:
save_data_to_csv(all_data[:100], file_name)
all_data = all_data[100:]
time.sleep(random.uniform(2, 4)) # Increasing the pause
if all_data:
save_data_to_csv(all_data, file_name)
all_data = []
print(f"Data for {location} successfully collected and saved to a file {file_name}")
locations = ["Москва"]
collect_real_estate_data(locations=locations, deal_type="sale", rooms=(1), start_page=1, end_page=50,
file_name=csv_name, with_extra_data=True)
this was the first version of the code, but the second, in which if you just put with_extra_data=True, it perfectly parses some of the data (but not all) that is missing. But at the same time, object does not care how much it goes through the declarations, it simply writes -1, that is, there is no value
keys:
{'author': 'VESPER', 'author_type': 'developer', 'url': 'https://www.cian.ru/sale/flat/307211222/', 'location': 'Москва', 'deal_type': 'sale', 'accommodation_type': 'flat', 'floor': 8, 'floors_count': 9, 'rooms_count': 1, 'total_meters': 57.6, 'price': 117936000, 'year_of_construction': '2023', <b>'object_type': -1</b>, 'house_material_type': 'Монолитный', 'heating_type': -1, 'finish_type': 'Без отделки, чистовая с мебелью', 'living_meters': -1, 'kitchen_meters': -1, 'phone': '+74951387184', 'district': 'Тверской', 'street': '1-я Тверская-Ямская ', 'house_number': '2', 'underground': 'Маяковская', 'residential_complex': 'Vesper Tverskaya'}
well, the second version of the code itself:
import cianparser
def print_all_keys(data):
for flat in data:
print(flat)
# Creating an instance of the parser
parser = cianparser.CianParser(location="Москва")
# get data
data = parser.get_flats(deal_type="sale", rooms=(1), additional_settings={"start_page": 1, "end_page": 1}, with_extra_data=True, with_saving_csv=True)
# print keys
print_all_keys(data)
Maybe there are problems with the connection? Either it doesn’t go deeper into the declaration, or I’m somehow using the signs and methods themselves incorrectly
and that’s what (unfortunately) I get from the first code in the form of a table
You need to sign in to view this answers
Leave feedback about this