Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 3 additions & 11 deletions api_app/analyzers_manager/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from typing import Dict, Tuple

import requests
from django.conf import settings

Check failure on line 12 in api_app/analyzers_manager/classes.py

View workflow job for this annotation

GitHub Actions / linters

Ruff (F401)

api_app/analyzers_manager/classes.py:12:25: F401 `django.conf.settings` imported but unused

from api_app.decorators import classproperty
from certego_saas.apps.user.models import User
Expand Down Expand Up @@ -241,17 +241,9 @@

def after_run(self):
    """Hook executed after the analyzer run completes.

    The downloaded sample is intentionally NOT deleted here: with S3
    storage, cached files live in a shared directory and are reused by
    all analyzers, so another analyzer may still be reading the same
    file.
    """
    super().after_run()
    # Lazy %-style logging args avoid building the message when the log
    # level filters it out; %r renders repr(self), the idiomatic
    # spelling of the previous explicit self.__repr__() call.
    logger.info(
        "FINISHED analyzer: %r -> File: (%s, md5: %s)",
        self,
        self.filename,
        self.md5,
    )


Expand Down
23 changes: 12 additions & 11 deletions intel_owl/settings/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,25 +33,26 @@ def retrieve(file, analyzer):
from storages.backends.s3boto3 import S3Boto3Storage

class S3Boto3StorageWrapper(S3Boto3Storage):
    """S3 storage backend that caches downloaded samples on local disk.

    Files are downloaded once into a shared cache directory and reused
    by every analyzer that needs them, instead of one HTTP download (and
    one local copy) per analyzer.
    """

    # Shared cache directory where files are downloaded once and
    # reused by every analyzer that needs them.
    _CACHE_DIR = os.path.join(MEDIA_ROOT, "_s3_cache")

    def retrieve(self, file, analyzer):
        """Return a local filesystem path for ``file``, downloading it
        from S3 into the shared cache on first access.

        :param file: Django file object whose ``name`` is the S3 key.
        :param analyzer: analyzer name; kept for interface compatibility
            but no longer used now that the cache is shared.
        :raises AssertionError: if the object does not exist on S3.
        :return: absolute path of the cached local copy.
        """
        name = file.name
        _path = os.path.join(self._CACHE_DIR, name)
        if not os.path.exists(_path):
            # local imports match the file's existing style and keep
            # settings import time minimal
            import shutil
            import tempfile

            os.makedirs(os.path.dirname(_path), exist_ok=True)
            if not self.exists(name):
                raise AssertionError(f"{name} does not exist on S3")
            # Download to a UNIQUELY named temp file in the same
            # directory, then atomically rename it into place. A unique
            # name (instead of a fixed "<path>.tmp") prevents two
            # concurrent workers downloading the same object from
            # clobbering each other's half-written file.
            fd, tmp_path = tempfile.mkstemp(
                dir=os.path.dirname(_path), suffix=".tmp"
            )
            try:
                with os.fdopen(fd, "wb") as local_file_object:
                    with self.open(name) as s3_file_object:
                        # Stream in chunks instead of reading the whole
                        # sample into memory — samples can be large and
                        # many analyzers run concurrently.
                        shutil.copyfileobj(s3_file_object, local_file_object)
                # atomic on the same filesystem: a concurrent reader
                # sees either no file or a complete file, never a
                # partial one; last writer wins harmlessly (same bytes)
                os.replace(tmp_path, _path)
            except Exception:
                # don't leave orphaned temp files behind on failure
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
                raise
        return _path

DEFAULT_FILE_STORAGE = "intel_owl.settings.S3Boto3StorageWrapper"
Expand Down
Loading