Skip to content

Commit 1ba8533

Browse files
Refactor GIAS commands into helper function
This will make it easier to create a scheduled Sidekiq job for this task. This also adds more comprehensive testing for these processes. Jira-Issue: MAV-2788
1 parent 17284c8 commit 1ba8533

6 files changed

Lines changed: 510 additions & 296 deletions

File tree

app/lib/gias.rb

Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
# frozen_string_literal: true
2+
3+
module GIAS
4+
class << self
5+
def download(output_file:)
  # Automates the manual GIAS export:
  #   1. Open https://get-information-schools.service.gov.uk/Downloads
  #   2. Tick "Establishment fields CSV" and "Establishment links CSV"
  #   3. Submit the form and wait for the extract to be generated
  #   4. Download the resulting zip file and save it to +output_file+
  #
  # Returns true when the zip was saved, or false when the download form
  # did not appear before polling gave up.

  # Loaded lazily so the dependency is only needed when downloading.
  require "mechanize"

  agent = Mechanize.new
  agent.user_agent_alias = "Mac Safari"

  downloads_page =
    agent.get("https://get-information-schools.service.gov.uk/Downloads")
  collate_form = downloads_page.form_with(action: "/Downloads/Collate")
  collate_form.checkbox_with(id: "establishment-fields-csv-checkbox").check
  collate_form.checkbox_with(id: "establishment-links-csv-checkbox").check
  status_page = collate_form.submit

  # Poll the status page every 2 seconds until the extract form shows up,
  # giving up once more than 60 seconds have been spent waiting.
  seconds_waited = 0
  download_form = nil
  loop do
    download_form =
      status_page.form_with(action: "/Downloads/Download/Extract")
    break if download_form || seconds_waited > 60

    sleep(2)
    seconds_waited += 2
    status_page = agent.get(status_page.uri)
  end

  return false unless download_form

  results_button = download_form.button_with(value: "Results.zip")
  agent.click(results_button).save!(output_file)
  true
end
44+
45+
# Imports the establishments found in the GIAS zip at +input_file+ as
# school locations, writing them in batches of 1000.
#
# Rows with a blank "EstablishmentNumber" are skipped. Each batch is
# passed to import_schools and then to update_sites.
#
# input_file   - path of the zip file containing the edubasealldata CSV.
# progress_bar - optional object responding to #increment; it is
#                incremented once for every CSV row, including skipped
#                rows, which matches how check_import reports progress.
def import(input_file:, progress_bar: nil)
  batch_size = 1000

  # Writes one batch: the school rows first, then the site rows that
  # share their URNs.
  persist = lambda do |batch|
    import_schools(batch)
    update_sites(batch)
  end

  open_csv(input_file) do |rows|
    schools = []

    rows.each do |row|
      gias_establishment_number = row["EstablishmentNumber"]
      next if gias_establishment_number.blank?

      schools << Location.new(
        type: :school,
        urn: row["URN"],
        gias_local_authority_code: row["LA (code)"],
        gias_establishment_number:,
        gias_phase: Integer(row["PhaseOfEducation (code)"]),
        gias_year_groups: process_year_groups(row),
        name: row["EstablishmentName"],
        address_line_1: row["Street"],
        address_line_2: [
          row["Locality"],
          row["Address3"]
        ].compact_blank.join(", "),
        address_town: row["Town"],
        address_postcode: row["Postcode"],
        status: Integer(row["EstablishmentStatus (code)"]),
        url: process_url(row["SchoolWebsite"].presence)
      )

      if schools.size >= batch_size
        persist.call(schools)
        schools.clear
      end
    ensure
      # In an ensure so that rows skipped with `next` still advance the
      # progress bar.
      progress_bar&.increment
    end

    # Flush whatever is left over after the last full batch.
    persist.call(schools) unless schools.empty?
  end
end
88+
89+
# Dry-run comparison of the GIAS zip at +input_file+ against the schools
# already in the database. Nothing is written by this method itself.
#
# Returns a hash with three keys:
#   new_schools                     - Set of URNs not yet known whose status
#                                     is "Open" or "Open, but proposed to
#                                     close"
#   schools_with_future_sessions    - :existing (Set of URNs with scheduled
#                                     sessions) plus :closed, :closing and
#                                     :year_group_changes hashes keyed by URN
#   schools_without_future_sessions - the same three hashes for schools that
#                                     belong to a team but are not in
#                                     :existing
#
# progress_bar - optional object responding to #increment; incremented once
#                per row of the establishment CSV.
def check_import(input_file:, progress_bar: nil)
  urns_with_scheduled_sessions =
    Set.new(
      Location.school.joins(:sessions).merge(Session.scheduled).pluck(:urn)
    )

  schools_with_future_sessions = {
    existing: urns_with_scheduled_sessions,
    closed: {},
    closing: {},
    year_group_changes: {}
  }
  schools_without_future_sessions = {
    closed: {},
    closing: {},
    year_group_changes: {}
  }

  existing_schools = Set.new(Location.school.pluck(:urn))
  team_schools =
    Set.new(
      TeamLocation
        .joins(:location)
        .merge(Location.school)
        .pluck(:"locations.urn")
    )

  new_schools = Set.new

  Zip::File.open(input_file) do |zip|
    # Build a URN => [successor URNs] lookup from the links CSV.
    links_content =
      zip.glob("links_edubasealldata*.csv").first.get_input_stream.read

    successors = {}
    CSV.parse(
      links_content,
      headers: true,
      encoding: "ISO-8859-1:UTF-8"
    ) do |link|
      next unless link["LinkType"]&.include?("Successor")

      (successors[link["URN"]] ||= []) << link["LinkURN"]
    end

    establishments_content =
      zip.glob("edubasealldata*.csv").first.get_input_stream.read

    CSV.parse(
      establishments_content,
      headers: true,
      encoding: "ISO-8859-1:UTF-8"
    ) do |row|
      next if row["EstablishmentNumber"].blank?

      urn = row["URN"]

      # Pick the report this school's changes are recorded in, if any.
      report =
        if schools_with_future_sessions[:existing].include?(urn)
          schools_with_future_sessions
        elsif team_schools.include?(urn)
          schools_without_future_sessions
        end

      if report
        check_for_school_closure(row, report, successors)
        check_for_year_group_changes(row, report, existing_schools)
      elsif !existing_schools.include?(urn) &&
            ["Open", "Open, but proposed to close"].include?(
              row["EstablishmentStatus (name)"]
            )
        new_schools << urn
      end
    ensure
      progress_bar&.increment
    end
  end

  {
    new_schools:,
    schools_with_future_sessions:,
    schools_without_future_sessions:
  }
end
189+
190+
# Normalises a school website value into a URL with a scheme.
#
# url - the raw website string, or nil.
#
# Returns nil for blank input; otherwise the cleaned URL, prefixed with
# "https://" when it does not already start with http:// or https://
# (checked case-insensitively).
def process_url(url)
  return nil if url.blank?

  # Surrounding whitespace would otherwise end up inside the final URL.
  url = url.strip

  # Legh Vale school has a URL of http:www.leghvale.st-helens.sch.uk
  # which is not a valid URL, so restore the missing slashes (for either
  # scheme, in any letter case).
  url = url.gsub(/(https?):(?=www)/i, '\1://')

  # Some school URLs don't start with http:// or https://. The match is
  # case-insensitive so "HTTP://..." is not prefixed a second time.
  url.match?(%r{\Ahttps?://}i) ? url : "https://#{url}"
end
200+
201+
# Derives the list of year groups from a row's statutory age range.
#
# The lowest year group is "StatutoryLowAge" minus 4 and the highest is
# "StatutoryHighAge" minus 5; values are converted with #to_i, so a
# missing value counts as 0.
#
# Returns an Array of Integers (empty when the low bound exceeds the high).
def process_year_groups(row)
  first_year_group = row["StatutoryLowAge"].to_i - 4
  last_year_group = row["StatutoryHighAge"].to_i - 5

  first_year_group.upto(last_year_group).to_a
end
206+
207+
# Counts the lines of the edubasealldata CSV inside the zip at
# +input_file+. Every line is counted, so the figure includes the header
# line as well as the data rows.
def row_count(input_file)
  Zip::File.open(input_file) do |archive|
    entry = archive.glob("edubasealldata*.csv").first
    entry.get_input_stream.read.each_line.count
  end
end
213+
214+
private
215+
216+
# Opens the zip at +input_file+, parses its edubasealldata CSV (read as
# ISO-8859-1 and transcoded to UTF-8, with headers) and yields the parsed
# rows to the block while the archive is still open.
def open_csv(input_file)
  Zip::File.open(input_file) do |archive|
    entry = archive.glob("edubasealldata*.csv").first
    raw_csv = entry.get_input_stream.read

    yield CSV.parse(raw_csv, headers: true, encoding: "ISO-8859-1:UTF-8")
  end
end
225+
226+
# Bulk-upserts the given Location records. On a URN conflict (restricted
# to rows where "site IS NULL") the listed address, GIAS, name, status and
# url columns are overwritten with the incoming values.
def import_schools(schools)
  updatable_columns = %i[
    address_line_1
    address_line_2
    address_postcode
    address_town
    gias_establishment_number
    gias_local_authority_code
    gias_phase
    gias_year_groups
    name
    status
    url
  ]

  upsert_options = {
    conflict_target: %i[urn],
    index_predicate: "site IS NULL",
    columns: updatable_columns
  }

  Location.import!(schools, on_duplicate_key_update: upsert_options)
end
248+
249+
# Copies the GIAS-derived attributes from each school in +schools+ onto
# the existing locations that share its URN and have a non-nil site, then
# bulk-upserts those locations. Does nothing when no such location exists.
def update_sites(schools)
  schools_by_urn = schools.index_by(&:urn)

  # The attributes copied from the school onto each of its sites; the
  # same list is used as the upsert's update columns.
  synced_columns = %i[
    gias_establishment_number
    gias_local_authority_code
    gias_phase
    gias_year_groups
    status
    url
  ]

  sites =
    Location
      .where(urn: schools_by_urn.keys)
      .where.not(site: nil)
      .distinct
      .map do |site|
        source = schools_by_urn[site.urn]
        copied =
          synced_columns.to_h { |column| [column, source.public_send(column)] }

        site.assign_attributes(copied)
        site
      end

  return if sites.empty?

  Location.import!(
    sites,
    on_duplicate_key_update: {
      conflict_target: %i[urn site],
      columns: synced_columns
    }
  )
end
289+
290+
# Records a school in +school_set+ when its row says it is closed or
# about to close.
#
# A status of "Closed" is filed under :closed and "Open, but proposed to
# close" under :closing, keyed by URN, with the school's successor URNs
# (or an empty array when none are known) as the value. Any other status
# leaves +school_set+ untouched.
def check_for_school_closure(row, school_set, successors = {})
  bucket =
    case row["EstablishmentStatus (name)"]
    when "Closed"
      :closed
    when "Open, but proposed to close"
      :closing
    end
  return unless bucket

  urn = row["URN"]
  school_set[bucket][urn] = successors[urn] || []
end
300+
301+
# Records a year group change in +school_set+ when the year groups
# derived from +row+ differ from those stored for the school.
#
# Rows whose URN is not in +existing_schools+ are ignored. A difference
# is stored under school_set[:year_group_changes][urn] as a hash with
# :current (stored value) and :new (value derived from the row).
def check_for_year_group_changes(row, school_set, existing_schools)
  urn = row["URN"]
  return unless existing_schools.include?(urn)

  incoming = process_year_groups(row)
  stored = Location.school.find_by(urn:).gias_year_groups
  return if incoming == stored

  school_set[:year_group_changes][urn] = { current: stored, new: incoming }
end
315+
end
316+
end

0 commit comments

Comments
 (0)