Skip to content

Commit 3955d3b

Browse files
Refactor GIAS commands into helper function
This will make it easier to create a scheduled Sidekiq job for this task. This also adds more comprehensive testing for these processes. Jira-Issue: MAV-2788
1 parent 17284c8 commit 3955d3b

6 files changed

Lines changed: 514 additions & 296 deletions

File tree

app/lib/gias.rb

Lines changed: 320 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
# frozen_string_literal: true
2+
3+
module GIAS
4+
class << self
5+
# Downloads the GIAS establishment extract and saves it to +output_file+.
#
# Automates what would otherwise be done by hand on
# https://get-information-schools.service.gov.uk/Downloads: tick
# "Establishment fields CSV" and "Establishment links CSV", submit the
# form, wait for the extract to be prepared and download the zip file.
#
# output_file:: path the downloaded zip is saved to
#
# Returns true once the file has been saved, or false when the extract
# form never appeared while polling.
def download(output_file:)
  # Loaded lazily so the dependency is only needed when downloading.
  require "mechanize"

  browser = Mechanize.new
  browser.user_agent_alias = "Mac Safari"

  downloads_page =
    browser.get("https://get-information-schools.service.gov.uk/Downloads")
  collate_form = downloads_page.form_with(action: "/Downloads/Collate")
  collate_form.checkbox_with(id: "establishment-fields-csv-checkbox").check
  collate_form.checkbox_with(id: "establishment-links-csv-checkbox").check
  results_page = collate_form.submit

  # Re-fetch the results page every 2 seconds until the extract form shows
  # up, giving up once more than 60 seconds have been spent waiting.
  seconds_waited = 0
  extract_form = nil
  loop do
    extract_form =
      results_page.form_with(action: "/Downloads/Download/Extract")
    break if extract_form || seconds_waited > 60

    sleep(2)
    seconds_waited += 2
    results_page = browser.get(results_page.uri)
  end

  return false unless extract_form

  zip_button = extract_form.button_with(value: "Results.zip")
  browser.click(zip_button).save!(output_file)
  true
end
44+
45+
# Imports school locations from the GIAS extract at +input_file+.
#
# input_file::   path of the zip file handed to open_csv
# progress_bar:: optional object responding to +increment+. It is
#                incremented once for every CSV row, including rows that
#                are skipped because they have no establishment number,
#                so it advances at the same rate as in check_import.
#
# Each usable row becomes an unsaved Location of type :school. Records are
# written in batches of 1000 via import_schools followed by update_sites.
# Integer() is used for the phase and status codes, so a row with a missing
# or non-numeric code raises instead of being imported with a bogus value.
def import(input_file:, progress_bar: nil)
  batch_size = 1000

  open_csv(input_file) do |rows|
    schools = []

    rows.each do |row|
      gias_establishment_number = row["EstablishmentNumber"]

      unless gias_establishment_number.blank?
        schools << Location.new(
          type: :school,
          urn: row["URN"],
          gias_local_authority_code: row["LA (code)"],
          gias_establishment_number:,
          gias_phase: Integer(row["PhaseOfEducation (code)"]),
          gias_year_groups: process_year_groups(row),
          name: row["EstablishmentName"],
          address_line_1: row["Street"],
          address_line_2: [
            row["Locality"],
            row["Address3"]
          ].compact_blank.join(", "),
          address_town: row["Town"],
          address_postcode: row["Postcode"],
          status: Integer(row["EstablishmentStatus (code)"]),
          url: process_url(row["SchoolWebsite"].presence)
        )

        if schools.size >= batch_size
          import_schools(schools)
          update_sites(schools)
          schools.clear
        end
      end

      # Count skipped rows too; otherwise the bar never reaches its total.
      progress_bar&.increment
    end

    # Flush the final, partially filled batch.
    unless schools.empty?
      import_schools(schools)
      update_sites(schools)
    end
  end
end
88+
89+
# Compares the GIAS extract at +input_file+ with the schools already stored
# and reports what an import would affect.
#
# input_file::   path of the zip containing the "links_edubasealldata*.csv"
#                and "edubasealldata*.csv" entries
# progress_bar:: optional object responding to +increment+; incremented once
#                per establishment row, even for skipped rows
#
# Returns a hash with three keys:
# new_schools::                     Set of URNs that are not stored yet and
#                                   whose status name is "Open" or
#                                   "Open, but proposed to close"
# schools_with_future_sessions::    report for schools joined to sessions in
#                                   the Session.scheduled scope; also carries
#                                   the Set of those URNs under :existing
# schools_without_future_sessions:: the same report for other schools that
#                                   are linked through TeamLocation
#
# Each report has :closed and :closing (URN => successor URNs) and
# :year_group_changes (URN => { current:, new: }).
def check_import(input_file:, progress_bar: nil)
  scheduled_school_urns =
    Location.school.joins(:sessions).merge(Session.scheduled).pluck(:urn)

  schools_with_future_sessions = {
    existing: Set.new(scheduled_school_urns),
    closed: {},
    closing: {},
    year_group_changes: {}
  }
  schools_without_future_sessions = {
    closed: {},
    closing: {},
    year_group_changes: {}
  }

  existing_schools = Set.new(Location.school.pluck(:urn))
  team_school_urns =
    TeamLocation
      .joins(:location)
      .merge(Location.school)
      .pluck(:"locations.urn")
  team_schools = Set.new(team_school_urns)

  new_schools = Set.new

  csv_options = { headers: true, encoding: "ISO-8859-1:UTF-8" }
  open_statuses = ["Open", "Open, but proposed to close"]

  Zip::File.open(input_file) do |archive|
    links_content =
      archive.glob("links_edubasealldata*.csv").first.get_input_stream.read

    # URN => list of URNs named by that school's "Successor" links.
    successors = {}
    CSV.parse(links_content, **csv_options) do |link|
      next unless link["LinkType"]&.include?("Successor")

      (successors[link["URN"]] ||= []) << link["LinkURN"]
    end

    schools_content =
      archive.glob("edubasealldata*.csv").first.get_input_stream.read

    CSV.parse(schools_content, **csv_options) do |row|
      next if row["EstablishmentNumber"].blank?

      urn = row["URN"]

      # Pick the report this school belongs to, if it is one we track.
      report =
        if urn.in?(schools_with_future_sessions[:existing])
          schools_with_future_sessions
        elsif urn.in?(team_schools)
          schools_without_future_sessions
        end

      if report
        check_for_school_closure(row, report, successors)
        check_for_year_group_changes(row, report, existing_schools)
      elsif !urn.in?(existing_schools) &&
            row["EstablishmentStatus (name)"].in?(open_statuses)
        new_schools << urn
      end
    ensure
      # Runs for skipped rows as well, keeping the bar in step with the file.
      progress_bar&.increment
    end
  end

  {
    new_schools:,
    schools_with_future_sessions:,
    schools_without_future_sessions:
  }
end
193+
194+
# Normalises a school website address taken from the GIAS data.
#
# url:: raw website value; may be nil or blank
#
# Returns nil for blank input. Otherwise returns the URL with surrounding
# whitespace removed, the known "http:www" typo repaired, and "https://"
# prepended when the value does not already start with an http(s) scheme.
def process_url(url)
  return nil if url.blank?

  # Stray whitespace would otherwise hide an existing scheme from the check
  # below and end up embedded in the stored URL.
  url = url.strip

  # Legh Vale school has a URL of http:www.leghvale.st-helens.sch.uk
  # which is not a valid URL.
  url = url.gsub("http:www", "http://www")

  # Some school URLs don't start with http:// or https://. Schemes are
  # case-insensitive, so "HTTP://..." must not get a second scheme prepended.
  url.match?(%r{\Ahttps?://}i) ? url : "https://#{url}"
end
204+
205+
# Derives the list of year groups from a row's statutory age range.
#
# row:: anything indexable by "StatutoryLowAge" and "StatutoryHighAge"
#
# The lowest year group is the low age minus 4 and the highest is the high
# age minus 5; missing ages are read as 0 via to_i. Returns every integer
# between the two (inclusive), or an empty array when the range is empty.
def process_year_groups(row)
  youngest_age = row["StatutoryLowAge"].to_i
  oldest_age = row["StatutoryHighAge"].to_i

  Array((youngest_age - 4)..(oldest_age - 5))
end
210+
211+
# Counts the lines of the establishment CSV inside the zip at +input_file+.
#
# The first entry matching "edubasealldata*.csv" is read in full and every
# line is counted, so the figure includes the header line.
def row_count(input_file)
  Zip::File.open(input_file) do |archive|
    data_entry = archive.glob("edubasealldata*.csv").first
    contents = data_entry.get_input_stream.read
    contents.each_line.count
  end
end
217+
218+
private
219+
220+
# Opens the zip at +input_file+, parses its establishment CSV and yields the
# parsed rows to the caller's block.
#
# The first entry matching "edubasealldata*.csv" is read in full and parsed
# with headers, using the "ISO-8859-1:UTF-8" encoding option. The value of
# the caller's block is passed back through Zip::File.open.
def open_csv(input_file)
  Zip::File.open(input_file) do |archive|
    raw_csv =
      archive.glob("edubasealldata*.csv").first.get_input_stream.read

    yield CSV.parse(raw_csv, headers: true, encoding: "ISO-8859-1:UTF-8")
  end
end
229+
230+
# Bulk-writes the given school records with Location.import!.
#
# On a conflict on +urn+ (index predicate "site IS NULL") the listed address,
# GIAS, name, status and url columns are overwritten with the incoming values.
def import_schools(schools)
  refreshed_columns = %i[
    address_line_1
    address_line_2
    address_postcode
    address_town
    gias_establishment_number
    gias_local_authority_code
    gias_phase
    gias_year_groups
    name
    status
    url
  ]

  upsert_options = {
    conflict_target: %i[urn],
    index_predicate: "site IS NULL",
    columns: refreshed_columns
  }

  Location.import!(schools, on_duplicate_key_update: upsert_options)
end
253+
# Copies GIAS-derived attributes from freshly built school records onto the
# stored locations that share a URN and have a non-nil +site+.
#
# schools:: records responding to +urn+ and to each synced attribute
#
# Matching site records are loaded, updated in memory and written back in
# one Location.import! call that overwrites the synced columns on a
# (urn, site) conflict. Returns nil without writing when no site matches.
def update_sites(schools)
  synced_columns = %i[
    gias_establishment_number
    gias_local_authority_code
    gias_phase
    gias_year_groups
    status
    url
  ]

  school_for_urn = schools.index_by(&:urn)

  site_scope =
    Location.where(urn: school_for_urn.keys).where.not(site: nil).distinct

  sites =
    site_scope.map do |site|
      school = school_for_urn[site.urn]

      site.tap do |record|
        record.assign_attributes(
          synced_columns.to_h { |column| [column, school.public_send(column)] }
        )
      end
    end

  return if sites.empty?

  Location.import!(
    sites,
    on_duplicate_key_update: {
      conflict_target: %i[urn site],
      columns: synced_columns
    }
  )
end
293+
294+
# Records a school in +school_set+ when its row says it is closed or closing.
#
# row::        indexable by "URN" and "EstablishmentStatus (name)"
# school_set:: hash holding the :closed and :closing hashes to fill in
# successors:: URN => list of successor URNs (defaults to none)
#
# A status of "Closed" files the URN under :closed and "Open, but proposed
# to close" under :closing, each mapped to its successors (or an empty
# list). Any other status leaves +school_set+ untouched and returns nil.
def check_for_school_closure(row, school_set, successors = {})
  bucket =
    case row["EstablishmentStatus (name)"]
    when "Closed" then :closed
    when "Open, but proposed to close" then :closing
    end
  return if bucket.nil?

  urn = row["URN"]
  school_set[bucket][urn] = successors[urn] || []
end
304+
305+
# Notes in +school_set+ when the year groups derived from +row+ differ from
# those stored for the school with the same URN.
#
# row::              CSV row; "URN" plus the fields process_year_groups reads
# school_set::       hash holding the :year_group_changes hash to fill in
# existing_schools:: collection of URNs already stored; rows for other URNs
#                    are ignored
#
# On a difference, stores { current:, new: } under the URN. The stored
# value is looked up with one Location.school.find_by call per row.
def check_for_year_group_changes(row, school_set, existing_schools)
  urn = row["URN"]
  return unless existing_schools.include?(urn)

  incoming = process_year_groups(row)
  stored = Location.school.find_by(urn:).gias_year_groups
  return if incoming == stored

  school_set[:year_group_changes][urn] = { current: stored, new: incoming }
end
319+
end
320+
end

0 commit comments

Comments
 (0)