Skip to content

Commit 3975371

Browse files
committed
Improved importer script
With this change there are multiple options for updating existing records: one is the existing body update, and the other is re-importing the date from the mbox file.
1 parent 6a661b5 commit 3975371

4 files changed

Lines changed: 60 additions & 11 deletions

File tree

app/services/email_ingestor.rb

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# frozen_string_literal: true
22

33
class EmailIngestor
4-
def ingest_raw(raw_message, fallback_threading: false, trust_date: false)
4+
def ingest_raw(raw_message, fallback_threading: false, trust_date: false, update_existing: [])
55
m = Mail.new(raw_message)
66

77
message_id = clean_reference(m.message_id)
@@ -12,7 +12,7 @@ def ingest_raw(raw_message, fallback_threading: false, trust_date: false)
1212
body = normalize_body(extract_body(m))
1313
existing_message = Message.find_by_message_id(message_id)
1414
if existing_message
15-
existing_message.update_columns(body: body)
15+
update_existing_message(existing_message, body: body, sent_at: sent_at, update_existing: update_existing)
1616
return existing_message
1717
end
1818

@@ -64,6 +64,25 @@ def ingest_raw(raw_message, fallback_threading: false, trust_date: false)
6464

6565
private
6666

67+
def update_existing_message(message, body:, sent_at:, update_existing:)
68+
return if update_existing.empty?
69+
70+
updates = {}
71+
updates[:body] = body if update_existing.include?(:body)
72+
73+
if update_existing.include?(:date) && sent_at && message.created_at != sent_at
74+
updates[:created_at] = sent_at
75+
76+
# Update topic date if this is the first message
77+
topic = message.topic
78+
if topic && topic.messages.order(:created_at).first&.id == message.id
79+
topic.update_columns(created_at: sent_at)
80+
end
81+
end
82+
83+
message.update_columns(updates) if updates.any?
84+
end
85+
6786
def build_from_aliases(m, sent_at)
6887
if m.from.nil? || m.from[0].nil?
6988
name = m[:from].to_s.strip

lib/import_options.rb

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# frozen_string_literal: true
2+
3+
require 'optparse'
4+
5+
module ImportOptions
6+
def self.parse!(argv = ARGV)
7+
options = { update_existing: [] }
8+
9+
OptionParser.new do |opts|
10+
opts.banner = "Usage: #{$PROGRAM_NAME} [options] /path/to/mbox [...]"
11+
12+
opts.on('--update-body', 'Update body of existing messages') do
13+
options[:update_existing] |= [:body]
14+
end
15+
opts.on('--update-date', 'Update date of existing messages') do
16+
options[:update_existing] |= [:date]
17+
end
18+
end.parse!(argv)
19+
20+
options
21+
end
22+
end

script/mbox_import.rb

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
require_relative "../config/environment"
22
require_relative "../app/services/email_ingestor"
3+
require_relative "../lib/import_options"
34

4-
def create_users fields, created_at, limit = 0
5+
options = ImportOptions.parse!
6+
7+
def create_users fields, created_at, limit = 0
58
return [] unless fields
69

710
# Some old mboxes have header fields without parsed addresses
@@ -103,11 +106,12 @@ def sanitize_email_date(mail_date, mail_date_header, message_id)
103106
sanitized_date
104107
end
105108

106-
def parse_message(message)
107-
msg = EmailIngestor.new.ingest_raw(message, fallback_threading: true)
109+
def parse_message(message, update_existing:)
110+
msg = EmailIngestor.new.ingest_raw(message, fallback_threading: true, update_existing: update_existing)
108111
puts "Processing #{msg&.message_id || '(duplicate or invalid)'}"
109112
end
110113

114+
update_existing = options[:update_existing]
111115
message = ""
112116

113117
ARGV.each do |fn|
@@ -121,7 +125,7 @@ def parse_message(message)
121125
# all new messages refer to lists.postgresql.org, but not old emails
122126
# And we can't simply check for From, as it also matches inline attachments containing git diffs
123127
if (line.match(/^From [^@]+@[a-z\d\-]+(\.[a-z\d\-]+)*\.[a-z]+/i))
124-
parse_message(message) unless message.empty?
128+
parse_message(message, update_existing: update_existing) unless message.empty?
125129
message = ""
126130
else
127131
message << line

script/mbox_single_import.rb

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
require_relative "../config/environment"
22
require_relative "../app/services/email_ingestor"
3+
require_relative "../lib/import_options"
4+
5+
options = ImportOptions.parse!
36

47
if ARGV.length != 2
5-
puts "Usage: #{$PROGRAM_NAME} /path/to/mbox <message-id>"
8+
puts "Usage: #{$PROGRAM_NAME} [options] /path/to/mbox <message-id>"
69
exit 1
710
end
811

@@ -21,14 +24,14 @@ def normalize_message_id(message)
2124
''
2225
end
2326

24-
def process_message(message, target_id)
27+
def process_message(message, target_id, update_existing:)
2528
return false if message.empty?
2629

2730
message_id = normalize_message_id(message)
2831
return false if message_id.empty?
2932
return false unless message_id == target_id
3033

31-
msg = EmailIngestor.new.ingest_raw(message, fallback_threading: true)
34+
msg = EmailIngestor.new.ingest_raw(message, fallback_threading: true, update_existing: update_existing)
3235
if msg
3336
puts "Reimported #{msg.message_id}"
3437
else
@@ -37,6 +40,7 @@ def process_message(message, target_id)
3740
true
3841
end
3942

43+
update_existing = options[:update_existing]
4044
found = false
4145
message = ""
4246

@@ -47,7 +51,7 @@ def process_message(message, target_id)
4751
line = line.encode("utf-8", :invalid => :replace)
4852

4953
if line.match(/^From [^@]+@[a-z\d\-]+(\.[a-z\d\-]+)*\.[a-z]+/i)
50-
if process_message(message, target_id)
54+
if process_message(message, target_id, update_existing: update_existing)
5155
found = true
5256
break
5357
end
@@ -59,7 +63,7 @@ def process_message(message, target_id)
5963
end
6064

6165
if !found
62-
found = process_message(message, target_id)
66+
found = process_message(message, target_id, update_existing: update_existing)
6367
end
6468

6569
puts "Message not found in #{mbox_path}" unless found

0 commit comments

Comments
 (0)