Cleaned up and overhauled parallelism. (part 1)

Oracle 2025-10-18 18:37:54 +02:00
parent 271bb1f170
commit 429c694629
4 changed files with 220 additions and 100 deletions

View File

@@ -25,10 +25,12 @@ class AsyncronConfig(AppConfig):
         self.load_model_auxilaries()
         self.load_extensions()
-        #Init the asyncron worker for this process
+        #Init and run the asyncron worker singleton for this process if it's not already running.
         from .workers import AsyncronWorker
-        #The worker should not start working until they know we're responding to requests.
-        AsyncronWorker.init()
+        if AsyncronWorker.IS_ACTIVE:
+            worker = AsyncronWorker()
+            worker.start_after_db_ready = True
+            worker.start( daemon = True )

    def import_per_app( self, names ):
        for app in apps.get_app_configs():
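
Taken together with the worker changes below, this gives ready() a clear contract: nothing starts unless an entrypoint has set IS_ACTIVE beforehand, and start_after_db_ready defers the event loop until Django makes its first database connection. A minimal usage sketch of the two supported start modes (grounded in this commit's code, import path assumed as asyncron.workers):

    from asyncron.workers import AsyncronWorker

    AsyncronWorker.IS_ACTIVE = True      #An entrypoint opts in before apps.ready runs
    worker = AsyncronWorker()            #Always returns the per-process singleton
    worker.start_after_db_ready = True   #Don't touch the DB before Django does
    worker.start( daemon = True )        #Alongside a web server: background thread + own loop
    #...or in a standalone task processor, block the main thread instead:
    #worker.start()                      #daemon = False runs start_working( is_robust = True )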

View File

@@ -12,27 +12,32 @@ def post_fork( server, worker ): #worker and AsyncronWorker, pay attention!
     post_fork.worker = worker
     from .workers import AsyncronWorker
-    AsyncronWorker.log = worker.log
-    AsyncronWorker.log.info("Asyncron worker attached.")
+    AsyncronWorker.IS_ACTIVE = True
+    AsyncronWorker.register_init_callback( _patch )

-    init_to_override = AsyncronWorker.init
-    def init( *args, **kwargs ):
-        AsyncronWorker.MAX_COUNT = 1
-        AsyncronWorker.override_exit_signals()
-        if worker.reloader: #So if reload = True
-            to_override = worker.reloader._callback
-            def new_callback(*args, **kwargs):
-                AsyncronWorker.stop( reason = "Auto Reload" )
-                return to_override(*args, **kwargs)
-            worker.reloader._callback = new_callback
-        return init_to_override( *args, **kwargs )
-    AsyncronWorker.init = init
+def _patch( aworker ):
+    gserver = post_fork.server
+    gworker = post_fork.worker
+    if not gworker.reloader: return #So if reload = False
+    #Attach gunicorn reload event to asyncron_worker exit signals
+    original_callback = gworker.reloader._callback
+    def new_callback( *args, **kwargs ):
+        aworker.stop( reason = "Gunicorn Reload" )
+        return original_callback( *args, **kwargs )
+    gworker.reloader._callback = new_callback
+    aworker.log.setLevel( gworker.log.loglevel )
+    aworker.log.info( "Attached worker to gunicorn." )

 # Keeping the worker in post_fork.worker so we can add extra files in it for it to track
-# TODO: Currently unfinished, since i just realized using the "inotify" support of gunicorn
-# makes this reduntant, but still here is the relevant code if I want to also support the simpler
+# LOW PRIORITY TODO: Currently unfinished, since I just realized using the "inotify" support of gunicorn
+# makes this redundant, but still here is the relevant code if I want to also support the simpler
 # polling system
 # Should be in asyncron.app.ready
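
_patch is the commit's wrap-and-chain pattern in miniature: keep a reference to the callable you are replacing, do your own cleanup, then delegate. A generic sketch of the same idea (chain_before is a hypothetical helper, not part of asyncron or gunicorn):

    def chain_before( obj, attr, cleanup ):
        """Replace obj.attr with a wrapper that runs cleanup first, then the original."""
        original = getattr( obj, attr )
        def wrapper( *args, **kwargs ):
            cleanup()
            return original( *args, **kwargs )
        wrapper.wraps_callable = original #Same marker the worker uses to detect double wrapping
        setattr( obj, attr, wrapper )

    #chain_before( gworker.reloader, "_callback", lambda: aworker.stop( reason = "Gunicorn Reload" ) )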

View File

@@ -29,15 +29,15 @@ class Command(BaseCommand):
     help = 'Start an Asyncron Worker'

     def handle( self, *arg, **kwargs ):
-        AsyncronWorker.log = logging.getLogger(__name__)
+        AsyncronWorker.IS_ACTIVE = True
         while True:
-            worker = AsyncronWorker( daemon = False )
+            worker = AsyncronWorker()
             print( "Starting:", worker )
             try:
-                worker.start( is_robust = True )
+                worker.start()
             except Exception as e:
                 print("Worker Died with an error! Restarting in 10 seconds, traceback:")
                 print( traceback.format_exc() )
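
The hunk is cut off before the delay itself, so the following is an inference from the printed message rather than code shown in the diff: the loop presumably sleeps ten seconds and retries. A sketch of that supervise-and-restart shape:

    import time, traceback

    while True:
        worker = AsyncronWorker()
        try:
            worker.start() #daemon = False: blocks in the main thread until the worker stops or dies
        except Exception:
            print("Worker Died with an error! Restarting in 10 seconds, traceback:")
            print( traceback.format_exc() )
            time.sleep( 10 ) #Assumed from the message above; the real delay is outside this hunk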

View File

@ -1,8 +1,12 @@
from django.db import IntegrityError, models, close_old_connections from django.db import IntegrityError, models, close_old_connections
from django.utils import timezone from django.db.backends import signals as django_signals
from django.db.utils import OperationalError from django.db.utils import OperationalError
from django.utils import timezone
from asgiref.sync import sync_to_async from asgiref.sync import sync_to_async
import os, signal import os, signal
import time import time
import threading import threading
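
The reordered imports bring in django.db.backends.signals as django_signals, which the worker uses below to learn when the database first becomes reachable instead of sleeping. For reference, connection_created is a standard Django signal that fires once per new connection (on_first_connection is a hypothetical receiver):

    from django.db.backends.signals import connection_created

    def on_first_connection( sender, connection, **kwargs ):
        print( "DB ready via alias:", connection.alias )
        connection_created.disconnect( on_first_connection ) #One-shot, like the worker's handler below

    connection_created.connect( on_first_connection )
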
@@ -13,11 +17,26 @@ import random
 from .utils import retry_on_db_error, ignore_on_db_error

+ASYNC_TASKS_CLEANUP_THRESHOLD = 64
+
 class AsyncronWorker:
-    INSTANCES = [] #AsyncronWorker instance
-    MAX_COUNT = 0
-    EXIST_SIGNALS = [
+    """
+    The asyncron worker for a process, limited to one per process.
+    Other threads in the same process can offload their work to this thread and its single async event loop.
+    Alternatively, this worker can run in non-daemon mode and use the main thread.
+    The first case is meant to be used alongside the web server,
+    and the second case is meant for a standalone (more robust) task processor.
+    """
+    IS_ACTIVE = False #Whether this process needs an AsyncronWorker at all; this prevents running the worker on everything that causes asyncron.app.ready to run!
+    INSTANCE = None #Singleton
+    INIT_CALLBACKS = [] #Once the singleton is created, it'll run through the callbacks with itself as the first arg.
+    THREAD_LOCK = threading.Lock() #For initial singleton creation.
+    EXIT_SIGNALS = [
         signal.SIGABRT,
         signal.SIGHUP,
         signal.SIGQUIT,
@@ -26,108 +45,201 @@ class AsyncronWorker:
     ]

     @classmethod
-    def override_exit_signals( cls ):
-        for sig in cls.EXIST_SIGNALS:
+    def register_init_callback( cls, callback ):
+        assert not cls.INSTANCE, "Cannot register new callbacks after the worker is created!"
+        cls.INIT_CALLBACKS.append( callback )
+
+    def __new__( cls, *args, **kwargs ):
+        """
+        Thread safe singleton logic.
+        """
+        if cls.INSTANCE: return cls.INSTANCE
+        with cls.THREAD_LOCK:
+            if cls.INSTANCE: return cls.INSTANCE
+            cls.INSTANCE = super().__new__( cls, *args, **kwargs )
+            for callback in cls.INIT_CALLBACKS: callback( cls.INSTANCE )
+            cls.INSTANCE.log #Evaluating the log property while we have the lock
+            cls.INSTANCE.register_with_exit_signals()
+            django_signals.connection_created.connect( cls.INSTANCE.handle_new_db_connection )
+            cls.INSTANCE.log.debug("Worker created for this process.")
+        return cls.INSTANCE
+
+    @property
+    def log( self ):
+        if hasattr( self, '_log' ): return self._log
+        log = logging.getLogger("asyncron.worker")
+        handler = logging.StreamHandler()
+        #Taken from gunicorn, so it looks similar in the merged output
+        formatter = logging.Formatter(
+            r"%(asctime)s [%(process)d] [%(levelname)s] [Asyncron] %(message)s",
+            r"[%Y-%m-%d %H:%M:%S %z]"
+        )
+        handler.setFormatter(formatter)
+        log.addHandler(handler)
+        log.setLevel(logging.DEBUG)
+        self._log = log
+        return log
+
+    def register_with_exit_signals( self ):
+        """
+        Hooks this worker into EXIT_SIGNALS without breaking other handlers downstream,
+        so that self.handle_exit_signal is called on exit signals.
+        """
+        for sig in self.EXIT_SIGNALS:
             to_override = signal.getsignal(sig)
-            if getattr(to_override, "already_wrapped", False):
-                cls.log.warning(
+            if hasattr(to_override, "wraps_callable"):
+                self.log.warning(
                     f"An attempt was made to wrap around the {signal.strsignal(sig)} signal again!"
                     " Make sure you only call asyncron.AsyncronWorker.override_exit_signals once per process."
                 )
                 continue
-            if to_override and callable(to_override):
+            if to_override and callable( to_override ):
                 def wrapped( signum, frame ):
-                    cls.sigcatch( signum, frame )
-                    return to_override( signum, frame )
-                wrapped.already_wrapped = True
-                cls.log.debug(f"Wrapped {to_override} inside sigcatch for {signal.strsignal(sig)}")
+                    self.handle_exit_signal( signum, frame )
+                    return wrapped.wraps_callable( signum, frame )
+                wrapped.wraps_callable = to_override
+                self.log.debug(f"Wrapped '{to_override}' inside handle_exit_signal for: {signal.strsignal(sig)}")
                 signal.signal(sig, wrapped)
             else:
-                cls.log.debug(f"Direct sigcatch for {signal.strsignal(sig)}")
-                signal.signal(sig, cls.sigcatch)
+                self.log.debug(f"Directly listening for exit signal: {signal.strsignal(sig)}")
+                signal.signal(sig, self.handle_exit_signal)

-    @classmethod
-    def sigcatch( cls, signum, frame ):
-        cls.stop(f"Signal {signal.strsignal(signum)}")
+    def handle_exit_signal( self, signum, frame ):
+        self.stop(f"Signal {signal.strsignal(signum)}")

-    @classmethod
-    def stop( cls, reason = None ):
-        cls.log.info(f"[Asyncron] Stopping Worker(s): {reason}")
-        for worker in cls.INSTANCES:
-            if worker.is_stopping: continue
-            worker.is_stopping = True
-            worker.loop.call_soon_threadsafe(worker.loop.stop)
-        for worker in cls.INSTANCES:
-            if worker.thread.is_alive():
-                worker.thread.join()
+    def handle_new_db_connection( self, sender, **kwargs ):
+        if self.is_db_ready: return
+        self.log.debug(f"First DB connection: {sender}")
+        from .models import Worker, Task, Trace
+        self.is_db_ready = True
+        django_signals.connection_created.disconnect( self.handle_new_db_connection )
+        if not self.loop: return
+        self.loop.call_soon_threadsafe( self.is_db_ready_event.set )

-    @classmethod
-    def init( cls ):
-        if len(cls.INSTANCES) < cls.MAX_COUNT: cls()
-        #TODO: Use this to skip the 1 second delay in the self.start method on higher traffic servers.
-        #from django.db.backends.signals import connection_created
-        #from django.db.backends.postgresql.base import DatabaseWrapper
-        #from django.dispatch import receiver
-        #@receiver(connection_created, sender=DatabaseWrapper)
-        #def initial_connection_to_db(sender, **kwargs):
-        #    if len(cls.INSTANCES) < cls.MAX_COUNT: cls()
-
-    def __init__( self, daemon = True ):
-        self.INSTANCES.append(self)
-        self.is_stopping = False
-        self.clearing_dead_workers = False
-        self.watching_models = collections.defaultdict( set ) # Model -> Set of key name of the tasks
-        self.work_loop_over = asyncio.Event()
-        self.database_unreachable = False
-        self.all_tasks = []
+    ##
+    ## Start of instance methods
+    ##
+    def start( self, daemon = False ):
+        ## assert not self.thread, "This Worker has already been started once!"
         if daemon:
-            self.thread = threading.Thread( target = self.start )
+            self.thread = threading.Thread( target = self.start_working )
             self.thread.start()
+            return
+        assert threading.main_thread() == threading.current_thread(), f"Cannot run a non daemon worker in a thread other than main! Current Thread: {threading.current_thread()}"
+        self.thread = threading.current_thread()
+        self.start_working( is_robust = True )
+
+    def stop( self, reason = None ):
+        if self.is_stopping: return #TODO: Insisting on exiting faster should probably be managed in the signal handler
+        self.log.info(f"Stopping Worker: {reason}")
+        self.is_stopping = True
+        if not self.loop: return
+        self.loop.call_soon_threadsafe( self.is_stopping_event.set )
+
+    ##
+    ## North of here, all methods are to be run in the main thread.
+    ## South of here, all methods except __init__ potentially run in another thread,
+    ## with its own dedicated async event loop.
+    ##
+    def __init__( self ):
+        #Used only inside the create_task method
+        self.tasks_next_cleanup = ASYNC_TASKS_CLEANUP_THRESHOLD
+        self.tasks_running = []
+        #These booleans have asyncio.Event counterparts in the loop context
+        self.is_db_ready = False
+        self.is_stopping = False
+        #Just so that asyncron.apps.ready doesn't trigger the django warning
+        self.start_after_db_ready = False
+        #Parallelism
+        self.thread = None #Populated once the worker starts
+        self.loop = None #Populated once the event loop is ready
+        self.clearing_dead_workers = False
+        self.watching_models = collections.defaultdict( set ) # Model -> Set of key names of the tasks
+        self.database_unreachable = False
+
+    def event_loop_init( self ):
+        assert not self.loop, "This worker already has a running event loop!"
+        self.loop = asyncio.new_event_loop()
+        asyncio.set_event_loop( self.loop )
+        #These asyncio.Events have boolean counterparts in the __init__ section, for main thread non-async logic
+        self.is_db_ready_event = asyncio.Event()
+        self.is_stopping_event = asyncio.Event()
+        self.task_reason_jobs_queue = asyncio.Queue() #Run tasks from other threads, safely

     def create_task( self, coro, *, silent = False, name = None, context = None ):
         if not silent:
-            origina_coro = coro
-            async def coro():
-                try: return await origina_coro
-                except:
-                    self.log.warning(f"[Asyncron] Task Error {origina_coro} {name}:\n{traceback.format_exc()}" )
+            async def wrapped():
+                try:
+                    return await wrapped.coro
+                except KeyboardInterrupt as e:
+                    self.log.debug(f"Task {wrapped.coro} {name} Interrupted with {e}." )
                     raise
+                except:
+                    self.log.warning(f"Task Error {wrapped.coro} {name}:\n{traceback.format_exc()}" )
+                    raise
+            wrapped.coro = coro
+            coro = wrapped

-        self.all_tasks.append( self.loop.create_task( coro(), name = name, context = context ) )
-        for task in list(self.all_tasks):
-            if task.done(): self.all_tasks.remove(task)
+        task = self.loop.create_task( coro(), name = name, context = context )
+        self.tasks_running.append( task )
+        if self.tasks_next_cleanup < len(self.tasks_running):
+            task_count = len(self.tasks_running)
+            for t in list(self.tasks_running):
+                if t.done(): self.tasks_running.remove(t)
+            self.tasks_next_cleanup = ASYNC_TASKS_CLEANUP_THRESHOLD + len(self.tasks_running)
+            self.log.debug(
+                f"Cleaned up {task_count - len(self.tasks_running)} of {task_count} tasks,"
+                f" next cleanup at: {self.tasks_next_cleanup}"
+            )
+        return task

-    def start( self, is_robust = False ):
-        assert not hasattr(self, "loop"), "This worker is already running!"
+    def start_working( self, is_robust = False ):
+        self.event_loop_init()
+        if self.start_after_db_ready:
+            self.log.debug("Waiting on another module to create the first database connection...")
+            self.loop.run_until_complete( self.is_db_ready_event.wait() )
         from .models import Worker, Task, Trace
         self.model = Worker( pid = os.getpid(), thread_id = threading.get_ident(), is_robust = is_robust )
-        self.loop = asyncio.new_event_loop()
-        asyncio.set_event_loop( self.loop )
-        #Run tasks from other threads, safely
-        self.task_reason_jobs_queue = asyncio.Queue()
         self.create_task( self.consume_task_reason_jobs_queue() )
         #Fight over who's gonna be the master, prove your health in the process!
         self.create_task( retry_on_db_error(self.master_loop)() )
         main_task = self.create_task( self.work_loop() )
-        time.sleep(0.3) #To avoid the django initialization warning!
+        #time.sleep(0.3) #To avoid the django initialization warning!
         self.model.save()
         self.model.refresh_from_db()
         #Fill in the ID fields of the tasks we didn't dare to check with db until now
         from .models import Task
         for func in Task.registered_tasks.values():
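
Because register_init_callback asserts that no INSTANCE exists yet, the ordering contract of the new singleton is simple: register first, construct later, and each callback runs exactly once, under THREAD_LOCK, with the fresh instance as its argument. A minimal usage sketch (configure is a hypothetical callback):

    import logging
    from asyncron.workers import AsyncronWorker

    def configure( worker ):
        worker.log.setLevel( logging.INFO )

    AsyncronWorker.register_init_callback( configure ) #Must happen before the first AsyncronWorker()
    worker = AsyncronWorker()                          #configure( worker ) runs here, inside the lock
    assert AsyncronWorker() is worker                  #Every later call returns the same instance
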
@@ -139,7 +251,7 @@ class AsyncronWorker:
         self.attach_django_signals()
         try:
-            self.loop.run_until_complete( self.work_loop_over.wait() ) #This is the lifetime of this worker
+            self.loop.run_until_complete( self.is_stopping_event.wait() ) #This is the lifetime of this worker
         except KeyboardInterrupt: self.log.info(f"[Asyncron][W{self.model.id}] Worker Received KeyboardInterrupt, exiting...")
         except RuntimeError: self.log.info(f"[Asyncron][W{self.model.id}] Worker Stopped, exiting...")
         else: self.log.info(f"[Asyncron][W{self.model.id}] Worker exiting...")
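
Swapping work_loop_over for is_stopping_event is the heart of the lifetime change: the loop lives exactly as long as one Event stays unset, and other threads request shutdown via call_soon_threadsafe rather than calling loop.stop() out from under running tasks, as the old class-level stop() did. The pattern in isolation, independent of asyncron:

    import asyncio, threading

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop( loop )
    stop_event = asyncio.Event()

    def request_stop():                              #Safe to call from any thread
        loop.call_soon_threadsafe( stop_event.set )

    threading.Timer( 1.0, request_stop ).start()     #Stand-in for a signal handler or gunicorn reload
    loop.run_until_complete( stop_event.wait() )     #The "lifetime of this worker" line
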
@@ -169,17 +281,18 @@
                 time.sleep( 0.1 )
             else: break
+        self.log.debug("Worker stopped working.")
         #self.loop.call_soon(self.started.set)

     def attach_django_signals( self ):
-        django_signals = {
+        django_name_to_signals = {
             name : attr
             for name in ["post_save", "post_delete"] #To expand: dir(models.signals)
             if not name.startswith("_") #Don't get private stuff
             and ( attr := getattr(models.signals, name) ) #Just an assignment
             and isinstance( attr, models.signals.ModelSignal ) #Is a signal related to models!
         }
-        for name, signal in django_signals.items():
+        for name, signal in django_name_to_signals.items():
             signal.connect( functools.partial( self.model_changed, name ) )

         from .models import Task
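
The rename to django_name_to_signals also stops the local dict from shadowing the module-level django_signals import added at the top of the file. One caveat worth flagging on the connect call: Django holds receivers weakly by default, and a functools.partial with no other owner can be garbage-collected and silently disconnected. Whether that bites here depends on what keeps these partials alive; the usual defensive form is:

    import functools

    for name, sig in django_name_to_signals.items():
        #weak = False makes Django hold the partial strongly instead of by weak reference
        sig.connect( functools.partial( self.model_changed, name ), weak = False )
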
@@ -336,24 +449,24 @@ class AsyncronWorker:
                 await self.check_services()
                 await self.check_scheduled()
             except OperationalError as e:
-                self.log.warning(f"[Asyncron] DB Connection Error: {e}")
-                self.log.warning(f"[Asyncron] Traceback:\n{traceback.format_exc()}" )
+                self.log.warning(f"DB Connection Error: {e}")
+                self.log.warning(f"Traceback:\n{traceback.format_exc()}" )
                 self.check_interval = 60 #break
             except Exception as e:
-                self.log.warning(f"[Asyncron] check_scheduled failed: {e}")
-                self.log.warning(f"[Asyncron] Traceback:\n{traceback.format_exc()}" )
+                self.log.warning(f"check_scheduled failed: {e}")
+                self.log.warning(f"Traceback:\n{traceback.format_exc()}" )
                 self.check_interval = 20
             try:
                 await sync_to_async( close_old_connections )()
             except Exception as e:
-                self.log.warning(f"[Asyncron] close_old_connections failed: {e}")
-                self.log.warning(f"[Asyncron] Traceback:\n{traceback.format_exc()}" )
+                self.log.warning(f"close_old_connections failed: {e}")
+                self.log.warning(f"Traceback:\n{traceback.format_exc()}" )
             #break
-        self.work_loop_over.set()
+        self.is_stopping_event.set()

     async def consume_task_reason_jobs_queue( self ):
         while True:
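
One closing note on the bookkeeping rename from all_tasks to tasks_running: the old code swept the whole list on every spawn, which is O(n) per task; the new code sweeps only when the list outgrows the last sweep by ASYNC_TASKS_CLEANUP_THRESHOLD, which is amortized O(1). The policy in isolation (TaskTracker is a hypothetical stand-in for the worker):

    ASYNC_TASKS_CLEANUP_THRESHOLD = 64

    class TaskTracker:
        def __init__( self ):
            self.tasks_running = []
            self.tasks_next_cleanup = ASYNC_TASKS_CLEANUP_THRESHOLD

        def track( self, task ):
            self.tasks_running.append( task )
            if self.tasks_next_cleanup < len(self.tasks_running):
                #Drop finished tasks, then raise the bar past the survivors
                self.tasks_running = [ t for t in self.tasks_running if not t.done() ]
                self.tasks_next_cleanup = ASYNC_TASKS_CLEANUP_THRESHOLD + len(self.tasks_running)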