quran-json/bin/pull-arabic
2022-04-26 15:26:17 -03:00

76 lines
1.6 KiB
Ruby
Executable file

#!/usr/bin/env ruby
require "net/http"
require "nokogiri"
require "json"
require "paint"
##
# Configuration variables.
#
# Host name handed to Net::HTTP.new below (no scheme, no path).
base_uri = "www.sacred-texts.com"
# Format template for the request path; get_request fills in %{htm_file}.
path = "/isl/uq/%{htm_file}"
# Format template for the output file, resolved relative to this script's
# directory; the main loop fills in %{chapter_num}.
dest_path = File.join(__dir__, "..", "src", "arabic", "%{chapter_num}.json")
# The main loop iterates from chapter 1 up to and including this number.
chapter_count = 114
# Number of seconds the main loop sleeps after each chapter.
cool_off = 5
##
# Share a single Net::HTTP instance.
#
# The same object is used for every request issued by the main loop. It is
# configured for port 443 with SSL enabled.
# NOTE(review): #start is never called on this instance, so each #request
# presumably opens and closes its own connection - confirm against the
# Net::HTTP documentation if a persistent connection was intended.
http = Net::HTTP.new(base_uri, 443)
http.use_ssl = true
##
# Helper method.
#
# Returns the file name of the HTML page for a chapter: the chapter number
# zero-padded to at least three digits, followed by ".htm".
#
#   get_htm_file(1)   # => "001.htm"
#   get_htm_file(42)  # => "042.htm"
#   get_htm_file(114) # => "114.htm"
#
# chapter_num - the chapter number (expected to be a non-negative Integer).
def get_htm_file(chapter_num)
  # "%03d" does the zero-padding that was previously spelled out as three
  # separate branches (< 10, < 100, everything else).
  format("%03d.htm", chapter_num)
end
##
# Helper method.
#
# Builds (without sending) the GET request for one page.
#
# path     - format template that contains a %{htm_file} placeholder.
# htm_file - value substituted for the placeholder.
#
# Returns a Net::HTTP::Get whose Accept header is "text/html".
def get_request(path, htm_file)
  headers = { "Accept" => "text/html" }
  request_path = format(path, htm_file: htm_file)
  Net::HTTP::Get.new(request_path, headers)
end
##
# Helper method.
#
# Parses the HTML in res.body and appends one [verse_num, verse_txt] pair
# to rows for every <p align=RIGHT> found inside a table cell.
#
# For each such paragraph, the text of its <a> element(s) is converted to
# the verse number, and the paragraph text with that number removed becomes
# the verse text. U+200F and U+200E characters are removed from the verse
# text and surrounding whitespace is stripped.
#
# res  - object whose #body returns the HTML of a chapter page.
# rows - Array that is appended to in place.
#
# Raises ArgumentError (from Integer()) when the anchor text is not a valid
# integer literal.
def extract_verses!(res, rows)
  doc = Nokogiri::HTML(res.body)
  verses = doc.css("table tr td p[align=RIGHT]")
  verses.each do |verse|
    verse_num = Integer(verse.css("a").inner_text)
    # String#delete treats its argument as a set of characters, so
    # delete("12") would strip every "1" and every "2" from the whole verse.
    # sub removes only the first occurrence of the number as a substring.
    verse_txt = verse.text.sub(verse_num.to_s, "")
    # The marks must be removed before strip: strip does not treat them as
    # whitespace, so they would otherwise shield adjacent spaces.
    rows.push([verse_num, verse_txt.delete("\u200F\u200E").strip])
  end
end
##
# main()
#
# Visits chapters 1 through chapter_count in order. For each chapter the
# page is requested; when the response is a Net::HTTPOK its verses are
# extracted and written as pretty-printed JSON to the chapter's destination
# file, and any other response class is reported as an error without writing
# anything. The loop sleeps for cool_off seconds after every chapter.
(1..chapter_count).each do |chapter_num|
  rows = []
  final_dest = format(dest_path, chapter_num: chapter_num)
  puts "Fetch: #{chapter_num}"

  res = http.request(get_request(path, get_htm_file(chapter_num)))
  if res.is_a?(Net::HTTPOK)
    extract_verses!(res, rows)
    File.write(final_dest, JSON.pretty_generate(rows))
    ok_label = Paint["OK: ", :green, :bold]
    puts "#{ok_label}#{final_dest}"
  else
    error_label = Paint["ERROR (#{res.class}): ", :red, :bold]
    puts "#{error_label}#{final_dest}"
  end

  # The empty string makes puts emit the blank separator line.
  puts Paint["Chill for #{cool_off} seconds", :blue, :bold], ""
  sleep cool_off
end