pd.read_html error client remote disconnected

ghz 14 hours ago ⋅ 4 views

Yesterday the following code was giving me the dataframe without problem. But today it started giving me the following error:

  http.client.RemoteDisconnected: Remote end closed connection without response

Any solution? I know the problem is on the website's side, but I need a solution that lets me keep using the same code.

import datetime
import pandas as pd
import sqlalchemy


# NOTE(review): this re-import shadows the `datetime` module imported above —
# from here on, `datetime` names the class, not the module.
from datetime import datetime
from datetime import date, timedelta

#d = datetime.today().strftime('%d-%m-%Y')
# Current date bounds the scraping loop: years 2020..current year.
data_atual=datetime.now()
mes_atual = data_atual.month
ano_atual= data_atual.year

# NOTE(review): `df` is never used after this point.
df = []


# One table pair per (year j, month i); only months 9-12 are fetched.
for j in range (2020, ano_atual+1):

    for i in range (9,13):

           frame = []


           # Tables 0 and 2 of the same REN monthly report; the page is fetched
           # twice, once per table. NOTE(review): `decimal` and `thousands` are
           # both ',' — presumably thousands should be '.'; verify the output.
           df1 = pd.read_html('https://www.centrodeinformacao.ren.pt/userControls/GetExcel.aspx?T=REN_MENSAL&P='+str(j)+'&PP='+str(i)+'&PPP=26&PPPP=36&PPPPP=0&variation=PT', decimal=',', thousands=',')[0]
           df2 = pd.read_html('https://www.centrodeinformacao.ren.pt/userControls/GetExcel.aspx?T=REN_MENSAL&P='+str(j)+'&PP='+str(i)+'&PPP=26&PPPP=36&PPPPP=0&variation=PT', decimal=',', thousands=',')[2]


           frame.append(df1)
           frame.append(df2)
           print(frame)
           result = pd.concat(frame)
           print(result)
           # NOTE(review): a new engine (and connection pool) is created on
           # every iteration; creating it once before the loop would suffice.
           engine = sqlalchemy.create_engine('mysql+pymysql://root:1234@localhost:3306/projeto')


           # One MySQL table per (month, year), e.g. renmensal92020.
           result.to_sql(
              name='renmensal'+str(i)+str(j),  # database table name
              con=engine,
              if_exists='replace',
              index=False
             )

Answer

The error http.client.RemoteDisconnected: Remote end closed connection without response typically indicates that the server is closing the connection unexpectedly, often due to too many requests in a short period or server-side restrictions.

Steps to Address the Issue

  1. Add Delays Between Requests
    To avoid overwhelming the server, introduce a delay between each request.

    import time
    # Place this call inside the per-month loop, after each request, to pace traffic.
    time.sleep(2)  # 2-second delay between requests
    
  2. Use a Session with requests
    A persistent session may help reduce the chance of connection issues by reusing the same connection.

    import requests
    # A single Session reuses the underlying TCP connection across requests.
    session = requests.Session()
    
  3. Handle Connection Errors Gracefully
    Use a try-except block to catch and retry failed requests.

    import requests
    from requests.exceptions import ConnectionError, HTTPError, Timeout

    # Fetch both tables for each (year, month); on failure, skip that month.
    for j in range(2020, ano_atual + 1):
        for i in range(9, 13):
            url = f'https://www.centrodeinformacao.ren.pt/userControls/GetExcel.aspx?T=REN_MENSAL&P={j}&PP={i}&PPP=26&PPPP=36&PPPPP=0&variation=PT'
            try:
                # One request and one parse serve both tables — the original
                # fetched the same page twice, doubling the load that likely
                # triggers RemoteDisconnected in the first place.
                response = session.get(url, timeout=10)
                response.raise_for_status()  # surface HTTP 4xx/5xx as HTTPError
                tables = pd.read_html(response.content, decimal=',', thousands=',')
                df1, df2 = tables[0], tables[2]
            except (ConnectionError, HTTPError, Timeout) as e:
                # Timeout must be caught explicitly: a ReadTimeout raised by
                # timeout=10 is NOT a ConnectionError and would otherwise crash.
                print(f"Error fetching data for year {j}, month {i}: {e}")
                continue
    
  4. Respect the Server's Load
    If the server is rate-limited, check its documentation (if available) to ensure you're complying with its request limits.

  5. Retry Logic for Resiliency
    Add retry logic for failed requests.

    import time

    def fetch_with_retries(url, retries=3, delay=5):
        """Return a successful response for *url*, retrying transient failures.

        Tries up to *retries* times, sleeping *delay* seconds between attempts,
        and raises after the final failure with the last error chained.
        """
        last_error = None
        for attempt in range(retries):
            try:
                response = session.get(url, timeout=10)
                response.raise_for_status()  # Raise HTTPError for bad responses
                return response
            except requests.exceptions.RequestException as e:
                # RequestException also covers Timeout/ReadTimeout, which the
                # original (ConnectionError, HTTPError) pair let escape the loop.
                last_error = e
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < retries - 1:  # no pointless sleep after the last try
                    time.sleep(delay)
        raise Exception(f"Failed to fetch data after {retries} attempts") from last_error

    for j in range(2020, ano_atual + 1):
        for i in range(9, 13):
            try:
                url = f'https://www.centrodeinformacao.ren.pt/userControls/GetExcel.aspx?T=REN_MENSAL&P={j}&PP={i}&PPP=26&PPPP=36&PPPPP=0&variation=PT'
                response = fetch_with_retries(url)
                # Parse the page once; the original parsed the same content twice.
                tables = pd.read_html(response.content, decimal=',', thousands=',')
                df1, df2 = tables[0], tables[2]
            except Exception as e:
                print(f"Error fetching data for year {j}, month {i}: {e}")
                continue
    
  6. Alternative: Cache Data Locally
    If possible, download the data manually or implement a caching mechanism to avoid repeatedly querying the server for the same data.

Complete Revised Script

Here’s the fully revised code:

import pandas as pd
import sqlalchemy
import requests
import time
from datetime import datetime

# Current date bounds the scraping loop (years 2020..current year).
data_atual = datetime.now()
mes_atual = data_atual.month  # current month; not referenced later in this script
ano_atual = data_atual.year

# One Session so every request reuses the same HTTP connection.
session = requests.Session()
# NOTE(review): `df` is never used later in this script.
df = []

def fetch_with_retries(url, retries=3, delay=5):
    """Return a successful response for *url*, retrying transient failures.

    Tries up to *retries* times, sleeping *delay* seconds between attempts.
    Raises RuntimeError (an Exception subclass, so existing callers still
    catch it) with the last underlying error chained when every attempt fails.
    """
    last_error = None
    for attempt in range(retries):
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()  # turn HTTP 4xx/5xx into HTTPError
            return response
        except requests.exceptions.RequestException as e:
            # RequestException also covers Timeout/ReadTimeout, which the
            # original (ConnectionError, HTTPError) pair let escape — a read
            # timeout would have bypassed the retry logic entirely.
            last_error = e
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < retries - 1:  # don't sleep after the final failure
                time.sleep(delay)
    raise RuntimeError(f"Failed to fetch data after {retries} attempts") from last_error

# Create the engine once — the original rebuilt it (and its connection pool)
# on every iteration. create_engine is lazy, so no connection is opened until
# the first to_sql call.
engine = sqlalchemy.create_engine('mysql+pymysql://root:1234@localhost:3306/projeto')

for j in range(2020, ano_atual + 1):
    for i in range(9, 13):
        try:
            url = f'https://www.centrodeinformacao.ren.pt/userControls/GetExcel.aspx?T=REN_MENSAL&P={j}&PP={i}&PPP=26&PPPP=36&PPPPP=0&variation=PT'
            response = fetch_with_retries(url)
            # Parse the page once and take tables 0 and 2 — the original
            # parsed the same content twice.
            tables = pd.read_html(response.content, decimal=',', thousands=',')
            result = pd.concat([tables[0], tables[2]])
            # One MySQL table per (month, year), e.g. renmensal92020.
            result.to_sql(name=f'renmensal{i}{j}', con=engine, if_exists='replace', index=False)
        except Exception as e:
            print(f"Error processing data for year {j}, month {i}: {e}")
        # Pace the requests (step 1 of the answer) so the server is less
        # likely to drop the connection.
        time.sleep(2)

This script introduces retries, session management, delays, and robust error handling to reduce the likelihood of encountering RemoteDisconnected.