# Author: Benediktas Cicėnas
# Company: Kolmisoft
# Year: 2020
# About: Script downloads attachment files from links
#
# Command-line arguments, as read by the code below:
#   ARGV[0] - message text that is scanned for links
#   ARGV[1] - directory where downloaded attachments are saved
#   ARGV[2] - path of the log file
#   ARGV[3] - database host
#   ARGV[4] - database username
#   ARGV[5] - database password
#   ARGV[6] - database name

# Appends one line to the log file given in ARGV[2].
#
# The file is written directly instead of through a shell `echo`, because
# messages contain untrusted text (URLs, remote error messages) that must
# never be interpreted by a shell.
#
# @param message [#to_s] text to append
def print_to_log(message)
  File.open(ARGV[2], 'a') { |log| log.puts(message) }
rescue SystemCallError, IOError, TypeError => error
  # Logging is best effort: a missing or unwritable log path must not stop the script.
  warn("Unable to write to log file: #{error.message}")
end

begin
  require 'open-uri'
  require 'mechanize'
  require 'mysql2'
  require 'nokogiri'
rescue LoadError => error
  # LoadError is not a StandardError, so a bare `rescue` would never see it.
  # Record the missing library, then fail as before.
  print_to_log(error.message)
  raise
end

# Maximum number of attempts for a DB fetch or a link download
MAX_ATTEMPTS = 3

# Pages nested deeper than this are not scanned for further links
MAX_LINK_DEPTH = 4

# File names with these extensions are saved as attachments
ATTACHMENT_PATTERN = /\.(?:zip|csv|xlsx|xls|rar)\z/.freeze

RULES_QUERY = 'SELECT name, string_start, string_end, link_pattern FROM tariff_link_attachment_rules ORDER BY priority ASC'

# Cuts out part of text or web page.
#
# Keeps the text after the first occurrence of rule['string_start'] and before
# the first following occurrence of rule['string_end']. An empty (or nil)
# marker disables the corresponding cut.
#
# @param message [String, Mechanize::Page] text, or a page whose body is cut
# @param rule [Hash] rule row with 'string_start' and 'string_end' keys
# @return [String] the remaining text (empty when the start marker is absent)
def cut_message(message, rule)
  string_start = rule['string_start'].to_s
  string_end = rule['string_end'].to_s

  begin
    # Converts Mechanize::Page class object to html string
    message = message.body if message.is_a?(Mechanize::Page)
    # partition treats the marker literally; split(' ') would split on any whitespace
    message = message.partition(string_start)[2] unless string_start.empty?
    message = message.partition(string_end)[0] unless string_end.empty?
  rescue => error
    print_to_log(error.message)
  end

  message
end

# Checks if tariff link attachment rules cannot be applied.
#
# A rule cannot be applied when a non-empty marker is missing from the text,
# or when the end marker does not occur after the first start marker.
#
# @param message [String] plain text or page body
# @param rule [Hash] rule row with 'string_start' and 'string_end' keys
# @return [Boolean] true when the rule must be skipped
def rule_will_not_apply?(message, rule)
  string_start = rule['string_start'].to_s
  string_end = rule['string_end'].to_s

  return true if !string_start.empty? && !message.include?(string_start)
  return true if !string_end.empty? && !message.include?(string_end)
  return false if string_start.empty?

  # The start marker is present here; the end marker must follow it
  !message.partition(string_start)[2].include?(string_end)
end

# Get links from text or a web page.
#
# @param message [String, Mechanize::Page] e-mail text or a fetched page
# @param rule [Hash] rule row with 'string_start', 'string_end' and 'link_pattern' keys
# @return [Array<String>] unique links that satisfy the rule; errors are logged
#   and whatever was collected so far is returned
def get_links(message, rule)
  links = []
  message_cutted = cut_message(message, rule)

  begin
    case message
    when String
      # Email text - extract links from the text, keeping only those with a '//'
      links = URI.extract(message_cutted).select { |link| link.include?('//') }.uniq
    when Mechanize::Page
      message_links = message.links # get all the links from the web page

      unless rule['string_start'].to_s.empty? && rule['string_end'].to_s.empty? # if page was cut
        doc = Nokogiri::HTML(message_cutted) # transform cut html text into nokogiri html
        doc_links = doc.css('a').map { |link| link['href'] }.uniq # all hrefs inside the cut html
        # keep only the page links that are also present in the cut part
        message_links = message_links.select { |link| doc_links.include?(link.uri.to_s) }
      end

      # Return fully resolved links (transform relative to absolute)
      links = message_links.map { |link| link.resolved_uri.to_s }.uniq
    end

    # reject links that don't contain the pattern (the pattern is matched literally)
    link_pattern = rule['link_pattern'].to_s
    links = links.select { |link| link.include?(link_pattern) } unless link_pattern.empty?
  rescue => error
    print_to_log(error.message)
  end

  links
end

# Gets tariff link attachment rules.
#
# Connects with the credentials from ARGV[3..6] and retries on Mysql2::Error
# up to MAX_ATTEMPTS times. The rows are copied into an Array so that the
# connection can be closed before the downloads start.
#
# @return [Array<Hash>, nil] rule rows ordered by priority, nil when every attempt failed
def get_rules
  attempt = 0

  begin
    attempt += 1
    print_to_log("Fetching Tariff Attachment Rules from DB. Attempt #{attempt}")
    client = Mysql2::Client.new(host: ARGV[3], username: ARGV[4], password: ARGV[5], database: ARGV[6])

    begin
      client.query(RULES_QUERY).to_a
    ensure
      client.close
    end
  rescue Mysql2::Error => error
    print_to_log("Mysql2::Error: #{error.error_number}")
    retry if attempt < MAX_ATTEMPTS
    print_to_log("Fetch Failed")
    nil
  end
end

# Links that were already processed; shared by all recursion levels of download_links
@used_links = []

# Opens a link, retrying up to MAX_ATTEMPTS times.
#
# @param agent [Mechanize] agent used to fetch the link
# @param link [String] link to open
# @param depth_label [String] position of the link, used only in log messages
# @return [Object, nil] whatever agent.get returned, nil when every attempt failed
def fetch_link(agent, link, depth_label)
  attempt = 0

  begin
    attempt += 1
    print_to_log("Processing link: #{link}. Link depth: #{depth_label}, attempt #{attempt}")
    agent.get(link)
  rescue => error
    print_to_log("Error opening the link: #{error.message}, attempt #{attempt}")
    retry if attempt < MAX_ATTEMPTS
    print_to_log("Skipping link. Attempt #{attempt}")
    nil
  end
end

# Saves a fetched file into the directory given in ARGV[1] when its name has an attachment extension.
#
# @param active_link [Object] object returned by agent.get
def save_attachment(active_link)
  # Mechanize::Page inherits from Mechanize::File, so the exact class is
  # checked to keep web pages out of this branch.
  return unless active_link.instance_of?(Mechanize::File)
  return unless active_link.filename =~ ATTACHMENT_PATTERN

  file_to_save = "#{ARGV[1]}/#{active_link.filename}"
  print_to_log("Attachment found. Saving to #{file_to_save}")
  active_link.save(file_to_save)
end

# Downloads attachments from links.
#
# Collects the links of every applicable rule, opens each link that was not
# processed before, saves attachment files and recurses into web pages.
# Any error is logged and ends the processing of the current level.
#
# @param message [String, Mechanize::Page] text or page to scan for links
# @param rules [Enumerable<Hash>] tariff link attachment rules
# @param agent [Mechanize] agent used to open the links
# @param depth [Integer] nesting level of message; levels above MAX_LINK_DEPTH are skipped
# @param link_index [Integer] index of the parent link, used only in log messages
def download_links(message, rules, agent, depth = 0, link_index = 0)
  return 0 if depth > MAX_LINK_DEPTH

  print_to_log("Extracting links. Parent link Depth #{depth}_#{link_index}")

  text = message.is_a?(Mechanize::Page) ? message.body : message
  new_links = rules.flat_map do |rule|
    rule_will_not_apply?(text, rule) ? [] : get_links(message, rule)
  end.uniq

  print_to_log("Links found: #{new_links.length}")

  new_links.each_with_index do |link, index|
    next if @used_links.include?(link)

    @used_links << link

    # Try to open link
    active_link = fetch_link(agent, link, "#{depth + 1}_#{link_index}_#{index}")
    next if active_link.nil?

    # If link is a file - save it
    save_attachment(active_link)

    # if link is a web page - call this function again and check for attachment links there
    download_links(active_link, rules, agent, depth + 1, index) if active_link.is_a?(Mechanize::Page)
  end
rescue => error
  print_to_log("Error: #{error.message}")
end

print_to_log("Link attachment download script - start")

rules = get_rules

if rules.nil? || rules.empty?
  print_to_log("No tariff link attachment rules where found. Skipping download...")
else
  begin
    agent = Mechanize.new
    agent.user_agent_alias = 'Linux Firefox'
    print_to_log("Tariff link attachment rules found")
    download_links(ARGV[0], rules, agent)
    agent.shutdown
  rescue => error
    print_to_log("Error: #{error.message}")
  end
end

print_to_log("Link attachment download script - end")