Skip to content

Commit 756594a

Browse files
Update: [AEA-6053] - document preprocessing (#221)
## Summary - 🤖 Operational or Infrastructure Change - ✨ New Feature ### Details - add preprocessing lambda to convert documents to markdown before KB ingestion - two-stage S3 pipeline: `raw/` -> preprocessing -> `processed/` -> sync - Sonar fix suggestion: replace `/tmp` security issue with secure temp directory handling - cfn Guard suppressions for Lambda permission - cdk-nag suppressions for preprocessing policies - add excel-specific filtering for NHS scale documents - excluding cli.py and magika_shim.py from SonarCloud coverage --------- Co-authored-by: Beenyaa <bencegadanyi1@hotmail.com>
1 parent f3ee6f9 commit 756594a

31 files changed

Lines changed: 1344 additions & 308 deletions

.github/actions/sync_documents/action.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ runs:
3737
shell: bash
3838
run: |
3939
mkdir -p ./s3-content
40-
aws s3 sync s3://${{ steps.find-source-bucket.outputs.BUCKET_NAME }} ./s3-content
40+
aws s3 sync s3://${{ steps.find-source-bucket.outputs.BUCKET_NAME }} ./s3-content --exclude "raw/*" --exclude "processed/*"
4141
4242
- name: Connect to Target Account
4343
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
@@ -65,9 +65,9 @@ runs:
6565
if [ -z "$DIFFS" ]; then
6666
echo -e "\033[0;32m✔ NO DISCREPANCIES FOUND.\033[0m"
6767
else
68-
echo -e "\033[0;33m⚠ WARNING: DISCREPANCIES FOUND:"
68+
echo -e "\033[0;33m⚠ WARNING: DISCREPANCIES FOUND:"
6969
70-
echo "$DIFFS"
70+
echo "$DIFFS"
7171
echo "--------------------------------------------------\033[0m"
7272
7373
CLEAN_DIFFS="${DIFFS//$'\n'/'%0A'}"
@@ -78,5 +78,5 @@ runs:
7878
- name: Upload Files to Target S3
7979
shell: bash
8080
run: |
81-
echo "Updating s3://${{ steps.find-destination-bucket.outputs.BUCKET_NAME }}..."
82-
aws s3 sync ./s3-content s3://${{ steps.find-destination-bucket.outputs.BUCKET_NAME }} --delete
81+
echo "Updating s3://${{ steps.find-destination-bucket.outputs.BUCKET_NAME }}/processed/..."
82+
aws s3 sync ./s3-content s3://${{ steps.find-destination-bucket.outputs.BUCKET_NAME }}/processed/ --delete

.github/workflows/cdk_package_code.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ jobs:
6868
run: |
6969
poetry show --only=slackBotFunction | grep -E "^[a-zA-Z]" | awk '{print $1"=="$2}' > requirements_slackBotFunction
7070
poetry show --only=syncKnowledgeBaseFunction | grep -E "^[a-zA-Z]" | awk '{print $1"=="$2}' > requirements_syncKnowledgeBaseFunction
71+
poetry show --only=preprocessingFunction | grep -E "^[a-zA-Z]" | awk '{print $1"=="$2}' > requirements_preprocessingFunction
7172
poetry show --only=bedrockLoggingConfigFunction | grep -E "^[a-zA-Z]" | awk '{print $1"=="$2}' > requirements_bedrockLoggingConfigFunction
7273
if [ ! -s requirements_slackBotFunction ] || [ "$(grep -c -v '^[[:space:]]*$' requirements_slackBotFunction)" -eq 0 ]; then \
7374
echo "Error: requirements_slackBotFunction is empty or contains only blank lines"; \
@@ -77,12 +78,29 @@ jobs:
7778
echo "Error: requirements_syncKnowledgeBaseFunction is empty or contains only blank lines"; \
7879
exit 1; \
7980
fi
81+
if [ ! -s requirements_preprocessingFunction ] || [ "$(grep -c -v '^[[:space:]]*$' requirements_preprocessingFunction)" -eq 0 ]; then \
82+
echo "Error: requirements_preprocessingFunction is empty or contains only blank lines"; \
83+
exit 1; \
84+
fi
8085
if [ ! -s requirements_bedrockLoggingConfigFunction ] || [ "$(grep -c -v '^[[:space:]]*$' requirements_bedrockLoggingConfigFunction)" -eq 0 ]; then \
8186
echo "Error: requirements_bedrockLoggingConfigFunction is empty or contains only blank lines"; \
8287
exit 1; \
8388
fi
8489
mkdir -p .dependencies/slackBotFunction/python
8590
mkdir -p .dependencies/syncKnowledgeBaseFunction/python
91+
mkdir -p .dependencies/preprocessingFunction/python
92+
pip3 install -r requirements_slackBotFunction -t .dependencies/slackBotFunction/python
93+
pip3 install -r requirements_syncKnowledgeBaseFunction -t .dependencies/syncKnowledgeBaseFunction/python
94+
pip3 install -r requirements_preprocessingFunction -t .dependencies/preprocessingFunction/python
95+
rm -rf .dependencies/preprocessingFunction/python/magika* .dependencies/preprocessingFunction/python/onnxruntime*
96+
cp packages/preprocessingFunction/magika_shim.py .dependencies/preprocessingFunction/python/magika.py
97+
find .dependencies/preprocessingFunction/python -type d -name "tests" -exec rm -rf {} + 2>/dev/null || true
98+
find .dependencies/preprocessingFunction/python -type d -name "test" -exec rm -rf {} + 2>/dev/null || true
99+
find .dependencies/preprocessingFunction/python -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
100+
find .dependencies/preprocessingFunction/python -type d -name "examples" -exec rm -rf {} + 2>/dev/null || true
101+
find .dependencies/preprocessingFunction/python -type f \( -name "*.pyc" -o -name "*.pyo" -o -name "*.so.debug" \) -delete
102+
find .dependencies/preprocessingFunction/python -type f -name "*.md" ! -name "README.md" -delete
103+
find .dependencies/preprocessingFunction/python -name "*.txt" -size +10k -delete
86104
mkdir -p .dependencies/bedrockLoggingConfigFunction/python
87105
pip3 install -r requirements_slackBotFunction -t .dependencies/slackBotFunction/python
88106
pip3 install -r requirements_syncKnowledgeBaseFunction -t .dependencies/syncKnowledgeBaseFunction/python

.github/workflows/pull_request.yml

Lines changed: 96 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,6 @@ env:
88
BRANCH_NAME: ${{ github.event.pull_request.head.ref }}
99

1010
jobs:
11-
dependabot-auto-approve-and-merge:
12-
needs: quality_checks
13-
uses: NHSDigital/eps-common-workflows/.github/workflows/dependabot-auto-approve-and-merge.yml@b933ef1bb3527fd7e7d5a7629fbd4e4dd94bf1b4
14-
secrets:
15-
AUTOMERGE_APP_ID: ${{ secrets.AUTOMERGE_APP_ID }}
16-
AUTOMERGE_PEM: ${{ secrets.AUTOMERGE_PEM }}
1711
get_asdf_version:
1812
runs-on: ubuntu-22.04
1913
outputs:
@@ -32,20 +26,105 @@ jobs:
3226
TAG_FORMAT=$(yq '.TAG_FORMAT' .github/config/settings.yml)
3327
echo "TAG_FORMAT=$TAG_FORMAT" >> "$GITHUB_OUTPUT"
3428
29+
get_commit_message:
30+
runs-on: ubuntu-22.04
31+
outputs:
32+
commit_message: ${{ steps.commit_message.outputs.commit_message }}
33+
steps:
34+
- name: Checkout code
35+
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
36+
with:
37+
ref: ${{ env.BRANCH_NAME }}
38+
fetch-depth: 0
39+
- name: Get Commit message
40+
id: commit_message
41+
run: |
42+
echo "commit_message=$(git show -s --format=%s)" >> "$GITHUB_OUTPUT"
43+
3544
quality_checks:
3645
uses: NHSDigital/eps-common-workflows/.github/workflows/quality-checks.yml@b933ef1bb3527fd7e7d5a7629fbd4e4dd94bf1b4
37-
needs: [get_asdf_version]
46+
needs: [get_asdf_version, get_commit_message]
47+
if: ${{ ! contains(needs.get_commit_message.outputs.commit_message, '#skip-qc') }}
3848
with:
3949
asdfVersion: ${{ needs.get_asdf_version.outputs.asdf_version }}
4050
secrets:
4151
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
4252

53+
quality_gate:
54+
needs: get_commit_message
55+
runs-on: ubuntu-22.04
56+
if: always()
57+
steps:
58+
- name: Wait for quality checks to succeed
59+
if: ${{ ! contains(needs.get_commit_message.outputs.commit_message, '#skip-qc') }}
60+
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd
61+
with:
62+
github-token: ${{ secrets.GITHUB_TOKEN }}
63+
result-encoding: json
64+
script: |
65+
const owner = context.repo.owner;
66+
const repo = context.repo.repo;
67+
const runId = context.runId;
68+
69+
// How many times to poll
70+
const pollTime = 10000; // 10 seconds
71+
const maxRetries = 120; // 20 minutes at 10 seconds each
72+
let attempts = 0;
73+
74+
async function fetchQCJob() {
75+
const { data } = await github.rest.actions.listJobsForWorkflowRun({
76+
owner, repo, run_id: runId
77+
});
78+
return data.jobs.find(job => job.name === 'quality_checks / quality_checks');
79+
}
80+
81+
let qc = await fetchQCJob();
82+
while ((!qc || qc.status !== 'completed') && attempts < maxRetries) {
83+
attempts++;
84+
console.log(`Attempt #${attempts}: ` +
85+
(qc
86+
? `found job "${qc.name}" with status=${qc.status}`
87+
: 'no matching quality_checks job yet'));
88+
await new Promise(r => setTimeout(r, pollTime));
89+
qc = await fetchQCJob();
90+
}
91+
92+
if (!qc) {
93+
core.setFailed(
94+
`Timed out waiting for a "quality_checks" job (after ${attempts} polls).`
95+
);
96+
return;
97+
}
98+
99+
if (qc.status !== 'completed') {
100+
core.setFailed(
101+
`Quality checks job never completed (last status=${qc.status}).`
102+
);
103+
return;
104+
}
105+
106+
if (qc.conclusion !== 'success') {
107+
core.setFailed(
108+
`Quality checks failed (conclusion=${qc.conclusion}).`
109+
);
110+
}
111+
112+
- name: Bypass QC gate
113+
if: ${{ contains(needs.get_commit_message.outputs.commit_message, '#skip-qc') }}
114+
run: echo "Skipping QC gate per commit message."
115+
116+
dependabot-auto-approve-and-merge:
117+
needs: quality_gate
118+
uses: NHSDigital/eps-common-workflows/.github/workflows/dependabot-auto-approve-and-merge.yml@b933ef1bb3527fd7e7d5a7629fbd4e4dd94bf1b4
119+
secrets:
120+
AUTOMERGE_APP_ID: ${{ secrets.AUTOMERGE_APP_ID }}
121+
AUTOMERGE_PEM: ${{ secrets.AUTOMERGE_PEM }}
122+
43123
pr_title_format_check:
44124
uses: NHSDigital/eps-common-workflows/.github/workflows/pr_title_check.yml@b933ef1bb3527fd7e7d5a7629fbd4e4dd94bf1b4
45125

46126
get_issue_number:
47127
runs-on: ubuntu-22.04
48-
needs: quality_checks
49128
outputs:
50129
issue_number: ${{steps.get_issue_number.outputs.result}}
51130

@@ -71,7 +150,11 @@ jobs:
71150
result-encoding: string
72151

73152
package_code:
74-
needs: [get_issue_number]
153+
needs: [get_issue_number, quality_gate]
154+
if: |
155+
always() &&
156+
! contains(needs.*.result, 'failure') &&
157+
! contains(needs.*.result, 'cancelled')
75158
uses: ./.github/workflows/cdk_package_code.yml
76159
with:
77160
STACK_NAME: epsam-pr-${{needs.get_issue_number.outputs.issue_number}}
@@ -80,6 +163,10 @@ jobs:
80163

81164
release_code:
82165
needs: [get_issue_number, package_code]
166+
if: |
167+
always() &&
168+
! contains(needs.*.result, 'failure') &&
169+
! contains(needs.*.result, 'cancelled')
83170
uses: ./.github/workflows/release_all_stacks.yml
84171
with:
85172
STACK_NAME: epsam-pr-${{needs.get_issue_number.outputs.issue_number}}

Makefile

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ lint-flake8:
4848
test:
4949
cd packages/slackBotFunction && PYTHONPATH=. COVERAGE_FILE=coverage/.coverage poetry run python -m pytest
5050
cd packages/syncKnowledgeBaseFunction && PYTHONPATH=. COVERAGE_FILE=coverage/.coverage poetry run python -m pytest
51+
cd packages/preprocessingFunction && PYTHONPATH=. COVERAGE_FILE=coverage/.coverage poetry run python -m pytest
5152
cd packages/bedrockLoggingConfigFunction && PYTHONPATH=. COVERAGE_FILE=coverage/.coverage poetry run python -m pytest
5253

5354
clean:
@@ -108,6 +109,7 @@ cdk-synth: cdk-synth-pr cdk-synth-non-pr
108109
cdk-synth-non-pr:
109110
mkdir -p .dependencies/slackBotFunction
110111
mkdir -p .dependencies/syncKnowledgeBaseFunction
112+
mkdir -p .dependencies/preprocessingFunction
111113
mkdir -p .dependencies/bedrockLoggingConfigFunction
112114
mkdir -p .local_config
113115
STACK_NAME=epsam \
@@ -127,6 +129,7 @@ cdk-synth-non-pr:
127129
cdk-synth-pr:
128130
mkdir -p .dependencies/slackBotFunction
129131
mkdir -p .dependencies/syncKnowledgeBaseFunction
132+
mkdir -p .dependencies/preprocessingFunction
130133
mkdir -p .dependencies/bedrockLoggingConfigFunction
131134
mkdir -p .local_config
132135
STACK_NAME=epsam-pr-123 \
@@ -159,14 +162,14 @@ sync-docs:
159162
./scripts/sync_docs.sh
160163

161164
convert-docs:
162-
poetry run python scripts/convert_docs_to_markdown.py
165+
cd packages/preprocessingFunction && poetry run python -m app.cli
163166

164167
convert-docs-file:
165168
@if [ -z "$$FILE" ]; then \
166169
echo "usage: FILE=your_doc.pdf make convert-docs-file"; \
167170
exit 1; \
168171
fi
169-
poetry run python scripts/convert_docs_to_markdown.py --file "$$FILE"
172+
cd packages/preprocessingFunction && poetry run python -m app.cli --file "$$FILE"
170173

171174

172175
compile:

packages/cdk/assets/s3-folders/processed/.gitkeep

Whitespace-only changes.

packages/cdk/assets/s3-folders/raw/.gitkeep

Whitespace-only changes.

packages/cdk/bin/utils/appUtils.ts

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ const addSuppressions = (resources: Array<CfnResource>, rules: Array<string>): v
5151
resource.cfnOptions.metadata = {}
5252
}
5353
const existing = resource.cfnOptions.metadata.guard?.SuppressedRules || []
54-
const combined = [...new Set([...existing, ...rules])]
54+
const combined = Array.from(new Set([...existing, ...rules]))
5555
resource.cfnOptions.metadata.guard = {SuppressedRules: combined}
5656
})
5757
}
@@ -63,9 +63,10 @@ export const applyCfnGuardSuppressions = (stack: Stack): void => {
6363
// Suppress all cfn-guard checks for all Lambda functions (including implicit CDK-generated ones)
6464
const allLambdas = findResourcesByType(stack, "AWS::Lambda::Function")
6565
addSuppressions(allLambdas, ["LAMBDA_DLQ_CHECK", "LAMBDA_INSIDE_VPC", "LAMBDA_CONCURRENCY_CHECK"])
66-
const permissionResources = findResourcesByPattern(stack, [
67-
"ApiPermission.Test.EpsAssistMeStackApisEpsAssistApiGateway1E1CF19C.POST..slack.events",
68-
"AllowBucketNotificationsToEpsAssistMeStackFunctionsSyncKnowledgeBaseFunctionepsamSyncKnowledgeBaseFunction94D011F3"
69-
])
70-
addSuppressions(permissionResources, ["LAMBDA_FUNCTION_PUBLIC_ACCESS_PROHIBITED"])
66+
67+
const apiGatewayPermissions = findResourcesByPattern(stack, ["ApiPermission"])
68+
addSuppressions(apiGatewayPermissions, ["LAMBDA_FUNCTION_PUBLIC_ACCESS_PROHIBITED"])
69+
70+
const s3NotificationPermissions = findResourcesByPattern(stack, ["AllowBucketNotifications"])
71+
addSuppressions(s3NotificationPermissions, ["LAMBDA_FUNCTION_PUBLIC_ACCESS_PROHIBITED"])
7172
}

packages/cdk/constructs/S3Bucket.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ export class S3Bucket extends Construct {
5858
principals: [props.deploymentRole],
5959
actions: [
6060
"s3:Abort*",
61+
"s3:DeleteObject",
6162
"s3:GetBucket*",
6263
"s3:GetObject*",
6364
"s3:List*",

packages/cdk/constructs/S3LambdaNotification.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import {Function as LambdaFunction} from "aws-cdk-lib/aws-lambda"
66
export interface S3LambdaNotificationProps {
77
bucket: Bucket
88
lambdaFunction: LambdaFunction
9+
prefix?: string
910
}
1011

1112
export class S3LambdaNotification extends Construct {
@@ -18,18 +19,23 @@ export class S3LambdaNotification extends Construct {
1819
const supportedExtensions = [".pdf", ".txt", ".md", ".csv", ".doc", ".docx", ".xls", ".xlsx", ".html", ".json"]
1920

2021
supportedExtensions.forEach(ext => {
22+
const filter: {suffix: string; prefix?: string} = {suffix: ext}
23+
if (props.prefix) {
24+
filter.prefix = props.prefix
25+
}
26+
2127
// Handle all file creation/modification events
2228
props.bucket.addEventNotification(
2329
EventType.OBJECT_CREATED,
2430
lambdaDestination,
25-
{suffix: ext}
31+
filter
2632
)
2733

2834
// Handle all file deletion events
2935
props.bucket.addEventNotification(
3036
EventType.OBJECT_REMOVED,
3137
lambdaDestination,
32-
{suffix: ext}
38+
filter
3339
)
3440
})
3541
}

0 commit comments

Comments
 (0)