Better logging, less crashed

This commit is contained in:
Oracle 2025-03-13 19:56:45 +01:00
parent c6ecc71517
commit 7dfe29b10d
4 changed files with 60 additions and 20 deletions

View File

@ -4,7 +4,7 @@
# #
## ##
import logging import traceback, logging
import asyncio import asyncio
import time import time
@ -31,9 +31,24 @@ class Command(BaseCommand):
def handle( self, *arg, **kwargs ): def handle( self, *arg, **kwargs ):
AsyncronWorker.log = logging.getLogger(__name__) AsyncronWorker.log = logging.getLogger(__name__)
worker = AsyncronWorker( daemon = False ) while True:
print( "Starting:", worker )
worker.start( is_robust = True ) worker = AsyncronWorker( daemon = False )
print( "Starting:", worker )
try:
worker.start( is_robust = True )
except Exception as e:
print("Worker Died with an error! Restarting in 10 seconds, traceback:")
print( traceback.format_exc() )
#TODO: worker.cleanup_tasks()
#worker.INSTANCES.remove( worker )
#del worker
time.sleep( 10 )
else: break

View File

@ -203,15 +203,14 @@ class Trace( BaseModel ):
except Exception as e: except Exception as e:
self.set_status( "E", f"Exception: {e}" ) self.set_status( "E", f"Exception: {e}" )
self.stderr = traceback.format_exc() self.stderr = traceback.format_exc()
else: else:
self.set_status( "C" ) self.set_status( "C" )
self.returned = output self.returned = output
finally: self.commit_on_new_print_task.cancel()
self.commit_on_new_print_task.cancel() self.last_end_datetime = timezone.now()
self.last_end_datetime = timezone.now() await self.asave()
await self.asave()
### Runtime methods for self aware tasks ### Runtime methods for self aware tasks

View File

@ -8,3 +8,26 @@ def rgetattr(obj, attr, *args):
def _getattr(obj, attr): def _getattr(obj, attr):
return getattr(obj, attr, *args) return getattr(obj, attr, *args)
return functools.reduce(_getattr, [obj] + attr.split('.')) return functools.reduce(_getattr, [obj] + attr.split('.'))
#Django keeps giving: exception=OperationalError('the connection is closed')
from django.db.utils import OperationalError
def retry_on_db_error( f ):
async def decorator( *args, **kwargs ):
while True:
try:
return await f( *args, **kwargs )
except OperationalError as e:
await asyncio.sleep( 1 )
return decorator
def ignore_on_db_error( f ):
async def decorator( *args, **kwargs ):
while True:
try:
return await f( *args, **kwargs )
except OperationalError as e:
return
return decorator

View File

@ -11,6 +11,8 @@ import asyncio
import collections, functools import collections, functools
import random import random
from .utils import retry_on_db_error, ignore_on_db_error
class AsyncronWorker: class AsyncronWorker:
INSTANCES = [] #AsyncronWorker instance INSTANCES = [] #AsyncronWorker instance
MAX_COUNT = 0 MAX_COUNT = 0
@ -89,7 +91,6 @@ class AsyncronWorker:
self.thread = threading.Thread( target = self.start ) self.thread = threading.Thread( target = self.start )
self.thread.start() self.thread.start()
def start( self, is_robust = False ): def start( self, is_robust = False ):
assert not hasattr(self, "loop"), "This worker is already running!" assert not hasattr(self, "loop"), "This worker is already running!"
from .models import Worker, Task, Trace from .models import Worker, Task, Trace
@ -99,7 +100,7 @@ class AsyncronWorker:
asyncio.set_event_loop( self.loop ) asyncio.set_event_loop( self.loop )
#Fight over who's gonna be the master, prove your health in the process! #Fight over who's gonna be the master, prove your health in the process!
self.loop.create_task( self.master_loop() ) self.loop.create_task( retry_on_db_error(self.master_loop)() )
main_task = self.loop.create_task( self.work_loop() ) main_task = self.loop.create_task( self.work_loop() )
time.sleep(0.3) #To avoid the django initialization warning! time.sleep(0.3) #To avoid the django initialization warning!
@ -124,10 +125,12 @@ class AsyncronWorker:
self.loop.run_until_complete( self.graceful_shutdown() ) self.loop.run_until_complete( self.graceful_shutdown() )
count = Trace.objects.filter( status__in = "SWRP", worker_lock = self.model ).update( try:
status_reason = "Worker died during execution", Trace.objects.filter( status__in = "SWRP", worker_lock = self.model ).update(
status = "A", worker_lock = None status_reason = "Worker died during execution",
) status = "A", worker_lock = None
)
except: pass
#if count: print(f"Had to cancel {count} task(s).") #cls.log.warning #if count: print(f"Had to cancel {count} task(s).") #cls.log.warning
@ -221,7 +224,7 @@ class AsyncronWorker:
current_master = True current_master = True
if not self.clearing_dead_workers: if not self.clearing_dead_workers:
self.loop.create_task( self.clear_dead_workers() ) self.loop.create_task( ignore_on_db_error(self.clear_dead_workers)() )
await self.sync_tasks() await self.sync_tasks()
await self.clear_orphaned_traces() await self.clear_orphaned_traces()
@ -281,19 +284,19 @@ class AsyncronWorker:
await self.check_scheduled() await self.check_scheduled()
except OperationalError as e: except OperationalError as e:
self.log.warning(f"[Asyncron] DB Connection Error: {e}") self.log.warning(f"[Asyncron] DB Connection Error: {e}")
print( traceback.format_exc() ) self.log.warning(f"[Asyncron] Traceback:\n{traceback.format_exc()}" )
self.check_interval = 60 #break self.check_interval = 60 #break
except Exception as e: except Exception as e:
self.log.warning(f"[Asyncron] check_scheduled failed: {e}") self.log.warning(f"[Asyncron] check_scheduled failed: {e}")
print( traceback.format_exc() ) self.log.warning(f"[Asyncron] Traceback:\n{traceback.format_exc()}" )
self.check_interval = 20 self.check_interval = 20
try: try:
await sync_to_async( close_old_connections )() await sync_to_async( close_old_connections )()
except Exception as e: except Exception as e:
self.log.warning(f"[Asyncron] close_old_connections failed: {e}") self.log.warning(f"[Asyncron] close_old_connections failed: {e}")
print( traceback.format_exc() ) self.log.warning(f"[Asyncron] Traceback:\n{traceback.format_exc()}" )
#break #break
@ -330,7 +333,7 @@ class AsyncronWorker:
except IntegrityError: count = 0 except IntegrityError: count = 0
if not count: continue #Lost the race condition to another worker. if not count: continue #Lost the race condition to another worker.
self.loop.create_task( self.start_trace_on_time( trace ) ) self.loop.create_task( ignore_on_db_error(self.start_trace_on_time)( trace ) )
async def start_trace_on_time( self, trace ): async def start_trace_on_time( self, trace ):
from .models import Trace from .models import Trace