fix: cache S3 files in a shared directory and stop deleting them after each analyzer run (storage.py & classes.py)

mannubaveja007 · mannubaveja007 · commit efb4f13dc91b · 2026-04-21T00:30:31.000+05:30
diff --git a/api_app/analyzers_manager/classes.py b/api_app/analyzers_manager/classes.py
@@ -241,17 +241,9 @@ def before_run(self):
 
     def after_run(self):
         super().after_run()
-        # We delete the file only if we have single copy for analyzer
-        # and the file has been saved locally.
-        # Otherwise we would remove the single file that we have on the server
-        if not settings.LOCAL_STORAGE and self.filepath is not None:
-            import os
-
-            try:
-                os.remove(self.filepath)
-            except OSError:
-                logger.warning(f"Filepath {self.filepath} does not exists")
-
+        # When using S3 storage, cached files are now stored in a shared
+        # directory and reused by all analyzers, so we must NOT delete them
+        # here — another analyzer may still be reading the same file.
         logger.info(f"FINISHED analyzer: {self.__repr__()} -> File: ({self.filename}, md5: {self.md5})")
 
 
diff --git a/intel_owl/settings/storage.py b/intel_owl/settings/storage.py
@@ -33,25 +33,26 @@ def retrieve(file, analyzer):
     from storages.backends.s3boto3 import S3Boto3Storage
 
     class S3Boto3StorageWrapper(S3Boto3Storage):
-        def retrieve(self, file, analyzer):
-            # FIXME we can optimize this a lot.
-            #  Right now we are doing an http request FOR analyzer. We can have a
-            #  proxy that will store the content and then save it locally
+        # Shared cache directory where files are downloaded once and
+        # reused by every analyzer that needs them.
+        _CACHE_DIR = os.path.join(MEDIA_ROOT, "_s3_cache")
 
-            # The idea is to download the file in MEDIA_ROOT/analyzer/namefile
-            # if it does not exist
-            path_dir = os.path.join(MEDIA_ROOT, analyzer)
+        def retrieve(self, file, analyzer):
             name = file.name
-            _path = os.path.join(path_dir, name)
+            _path = os.path.join(self._CACHE_DIR, name)
             if not os.path.exists(_path):
-                os.makedirs(path_dir, exist_ok=True)
+                os.makedirs(os.path.dirname(_path), exist_ok=True)
                 if not self.exists(name):
                     raise AssertionError
+                # Write to a temp file first, then rename it into place so a reader never
+                # sees a half-written file. NOTE(review): concurrent writers share tmp_path.
+                tmp_path = _path + ".tmp"
                 with self.open(name) as s3_file_object:
                     content = s3_file_object.read()
-                    s3_file_object.seek(0)
-                    with open(_path, "wb") as local_file_object:
+                    with open(tmp_path, "wb") as local_file_object:
                         local_file_object.write(content)
+                # os.replace() is atomic here: tmp_path and _path are on the same filesystem
+                os.replace(tmp_path, _path)
             return _path
 
     DEFAULT_FILE_STORAGE = "intel_owl.settings.S3Boto3StorageWrapper"