why is the data not being parsed? #machine learning #python


I need a lot of columns to analyze the data, and I use the cianparser library. The task is to parse flat-sale listings. But for some reason, exactly the fields I need (the ones collected into the processed_data list) refuse to be parsed! I tried printing the keys: I look, and they simply are not there, even though with_extra_data=True should, in theory, collect everything after if with_extra_data:, but it does not 🙁

keys:

{'author': 'Абсолют Недвижимость', 'author_type': 'developer', 'url': 'https://www.cian.ru/sale/flat/307207783/', 'location': 'Москва', 'deal_type': 'sale', 'accommodation_type': 'flat', 'floor': 14, 'floors_count': 19, 'rooms_count': 1, 'total_meters': 37.6, 'price_per_month': -1, 'commissions': 0, 'price': 9743081, 'district': '', 'street': '', 'house_number': '', 'underground': '', 'residential_complex': 'Город-парк Первый Московский'}

the code itself (this is not all of it; the rest is shown below):

import csv
import cianparser
import os
import time
import random
import requests
import logging

# Setting up logging
logging.basicConfig(filename="parsing_errors.log", level=logging.ERROR,
                    format="%(asctime)s:%(levelname)s:%(message)s")

def save_data_to_csv(data, file_name, mode="a"):
    if not data:
        return

    fieldnames = data[0].keys()
    file_exists = os.path.isfile(file_name)

    with open(file_name, mode, newline="", encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if not file_exists or mode == 'w':
            writer.writeheader()  # write the header only for a new file (or an explicit overwrite)
        writer.writerows(data)

def get_random_proxy():
    proxy_list = [
        '117.250.3.58:8080',
        '115.96.208.124:8080',
        '152.67.0.109:80',
        '45.87.68.2:15321',
        '68.178.170.59:80',
        '20.235.104.105:3729',
        '195.201.34.206:80'
    ]
    proxy = random.choice(proxy_list)
    return {"http": f"http://{proxy}", "https": f"http://{proxy}"}

csv_name = "moscow_4.csv"

# note: rooms=(1,) is a one-element tuple; (1) would just be the int 1
def collect_real_estate_data(locations, deal_type="sale", rooms=(1,), start_page=1, end_page=2,
                             file_name=csv_name, with_extra_data=True):
    all_data = []

    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Gecko/20100101 Firefox/85.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
    ]

    for location in locations:
        parser = cianparser.CianParser(location=location)
        print(f"Data collection for {location}...")

        for page in range(start_page, end_page + 1):
            user_agent = random.choice(user_agents)
            proxy = get_random_proxy()

            session = requests.Session()
            session.proxies.update(proxy)
            session.headers.update({"User-Agent": user_agent, "Referer": "https://cian.ru"})
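            # NOTE: this session (with the proxy and User-Agent) is built but never handed
            # to cianparser, so parser.get_flats below makes its own requests without it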

            retries = 3  # Number of attempts
            data = []  # fallback so the loop below still works if every attempt fails
            for attempt in range(retries):
                try:
                    data = parser.get_flats(deal_type=deal_type, rooms=rooms,
                                            additional_settings={"start_page": page, "end_page": page})
                    break  # If the data is successfully received, exit the loop
                except requests.exceptions.RequestException as e:
                    print(f"Error requesting page {page} for {location}: {e}. Attempt {attempt + 1}/{retries}")
                    time.sleep(5 + random.uniform(0, 5))  # Pause before retrying
                    if attempt == retries - 1:
                        logging.error(f"Could not get data from page {page} for {location}: {e}")

            processed_data = []
            for flat in data:
                processed_flat = {
                    "author": flat.get("author", "Not specified"),  
                    "author_type": flat.get("author_type", "Not specified"),  
                    "url": flat.get("url", "Not specified"),
                    "price": flat.get("price", "Not specified"),  
                    "location": flat.get("location", "Not specified"),  
                    "district": flat.get("district", "Not specified"),  
                    "street": flat.get("street", "Not specified"),  
                    "underground": flat.get("underground", "Not specified"),  
                    "total_meters": flat.get("total_meters", "Not specified"),  
                    "rooms_count": flat.get("rooms_count", "Not specified"),  
                    "floor": flat.get("floor", "Not specified"),  
                    "floors_count": flat.get("floors_count", "Not specified"),  
                    "residential_complex": flat.get("residential_complex", "Not specified"),  

                }

                if with_extra_data:
                    processed_flat["object_type"] = flat.get("object_type", "Not specified")  # Type of housing (secondary/new building)
                    processed_flat["class"] = flat.get("class", "Not specified")
                    processed_flat["finish_type"] = flat.get("finish_type", "Not specified")  
                    processed_flat["heating_type"] = flat.get("heating_type", "Not specified")
                    processed_flat["house_material_type"] = flat.get("house_material_type", "Not specified")  
                    processed_flat["metro_foot_minute"] = flat.get("metro_foot_minute", "Not specified")  
                    processed_flat["living_meters"] = flat.get("living_meters", "Not specified")  
                    processed_flat["kitchen_meters"] = flat.get("kitchen_meters", "Not specified")  
                    processed_flat["ceiling_height"] = flat.get("ceiling_height", "Not specified")  
                    processed_flat["year_construction"] = flat.get("year_construction", "Not specified")  
                    processed_flat["min_house_year"] = flat.get("min_house_year", "Not specified")  
                    processed_flat["parking_type"] = flat.get("parking_type", "Not specified")  
                    processed_flat["builder"] = flat.get("builder", "Not specified")  
                    processed_flat["have_loggia"] = flat.get("have_loggia", "Not specified")  
                    processed_flat["min_balconies"] = flat.get("min_balconies", "Not specified")  
                    processed_flat["max_house_year"] = flat.get("max_house_year", "Not specified")  
                print(flat)  # debug: show every key cianparser actually returned
                processed_data.append(processed_flat)

            all_data.extend(processed_data)

            if len(all_data) >= 100:
                save_data_to_csv(all_data[:100], file_name)
                all_data = all_data[100:]

            time.sleep(random.uniform(2, 4))  # Increased pause between pages

        if all_data:
            save_data_to_csv(all_data, file_name)
            all_data = []

        print(f"Data for {location} successfully collected and saved to a file {file_name}")

locations = ["Москва"]

collect_real_estate_data(locations=locations, deal_type="sale", rooms=(1,), start_page=1, end_page=50,
                         file_name=csv_name, with_extra_data=True)
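
One difference between the two versions worth flagging: in this first script, with_extra_data is only a parameter of my own collect_real_estate_data function and is never forwarded to parser.get_flats. If cianparser only fetches the extra fields when the flag is given to get_flats itself (which is what the second version below does), then the call inside the retry loop would presumably need to look like this (a sketch; everything else unchanged):

data = parser.get_flats(deal_type=deal_type, rooms=rooms,
                        with_extra_data=with_extra_data,  # forward the flag to cianparser itself
                        additional_settings={"start_page": page, "end_page": page})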

this was the first version of the code. In the second version, where with_extra_data=True is passed straight to get_flats, some (but not all) of the missing data is parsed just fine. But at the same time, no matter how many listings it goes through, object_type simply comes back as -1, that is, there is no value

keys:

{'author': 'VESPER', 'author_type': 'developer', 'url': 'https://www.cian.ru/sale/flat/307211222/', 'location': 'Москва', 'deal_type': 'sale', 'accommodation_type': 'flat', 'floor': 8, 'floors_count': 9, 'rooms_count': 1, 'total_meters': 57.6, 'price': 117936000, 'year_of_construction': '2023', 'object_type': -1, 'house_material_type': 'Монолитный', 'heating_type': -1, 'finish_type': 'Без отделки, чистовая с мебелью', 'living_meters': -1, 'kitchen_meters': -1, 'phone': '+74951387184', 'district': 'Тверской', 'street': '1-я Тверская-Ямская ', 'house_number': '2', 'underground': 'Маяковская', 'residential_complex': 'Vesper Tverskaya'}

and here is the second version of the code itself:

import cianparser

def print_all_keys(data):
    # print each flat dict in full so every key that was returned is visible
    for flat in data:
        print(flat)

# Creating an instance of the parser
parser = cianparser.CianParser(location="Москва")

# get data
data = parser.get_flats(deal_type="sale", rooms=(1,),
                        additional_settings={"start_page": 1, "end_page": 1},
                        with_extra_data=True, with_saving_csv=True)

# print keys
print_all_keys(data)
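
To see which of the extra fields ever come back with a real value, the -1 sentinels (cianparser writes -1 when there is no value) can be counted over everything get_flats returned. A quick throwaway sketch using the data variable from the code above:

from collections import Counter

missing = Counter()
for flat in data:
    for key, value in flat.items():
        if value == -1:  # -1 means the field was not filled
            missing[key] += 1

# fields whose count equals len(data) are never filled at all
print(len(data), missing.most_common())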

Maybe there are problems with the connection? Either the parser does not go deeper into the individual listings, or I am somehow using the fields and methods themselves incorrectly.
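
To rule out the connection hypothesis, a direct fetch of one listing page through one of the proxies should show whether the pages are reachable at all (a sketch; the URL is taken from the output above, and any of the free proxies in the list may simply be dead):

import requests

url = "https://www.cian.ru/sale/flat/307207783/"
proxy = {"http": "http://117.250.3.58:8080", "https": "http://117.250.3.58:8080"}

try:
    response = requests.get(url, proxies=proxy, timeout=10,
                            headers={"User-Agent": "Mozilla/5.0"})
    # 200 plus a large body means the listing page itself is reachable through the proxy
    print(response.status_code, len(response.text))
except requests.exceptions.RequestException as e:
    print(f"Proxy/connection problem: {e}")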

and this is what (unfortunately) I get from the first code, in the form of a table:

[screenshot: table produced by the first version of the code]


