view · edit · sidebar · attach · print · history

2026

2.2.2026

  • use xargs
 xargs -a data/text/today bundle exec ruby jobs/update_textinfo_swissmedicinfo --skip --target=both --reparse

27.1.2026

  • clean the file in place (remove ▼ paragraphs, replace · and • with -, collapse whitespace between tags)
 ruby -e 'f=ARGV[0]; c=File.read(f); n={p:0,d:0,b:0,w:0}; s=c.gsub(/<p[^>]*>.*?(▼|�).*?<\/p>/m){n[:p]+=1; ""}.gsub("·"){n[:d]+=1; "-"}.gsub("•"){n[:b]+=1; "-"}.gsub(/>\s+</){n[:w]+=1; "><"}; puts "Removed #{n[:p]} ▼ paragraphs, #{n[:d]} dots, #{n[:b]} bullets, #{n[:w]} whitespaces"; File.write(f,s) if s!=c' 38837_Trasylol_.html
  • check the chapters
 bundle exec ruby -I./src -I./ext/fiparse/src -rnokogiri -e 'file = ARGV[0]; doc = Nokogiri::HTML(File.read(file)); (1..5).each {|i| s = doc.at("#section#{i}"); puts "\n=== Section#{i} ==="; puts "Text (first 150 chars): " + s.inner_text[0..150].inspect if s}' data/details/fi/de/38837_Trasylol_.html
  • check sections
 bundle exec ruby -rnokogiri -e 'file = ARGV[0]; doc = Nokogiri::HTML(File.read(file)); puts "P tags: " + doc.search("p").length.to_s; puts "Divs with section id: " + doc.search("div[@id^=\"section\"]").length.to_s; doc.search("*[@id^=\"section\"]")[0..5].each {|e| puts "  #{e.name}##{e["id"]}: #{e.inner_text[0..30]}"}' data/details/fi/de/38837_Trasylol_.html
  • id check
 bundle exec ruby -rnokogiri -e 'file = ARGV[0]; doc = Nokogiri::HTML(File.read(file)); body = doc.at("body"); puts "Total p tags in body: " + body.search("p").length.to_s; puts "P tags with section id: " + body.search("p[@id^=\"section\"]").length.to_s; puts "\nProcessing by extract method:"; body.search("p").each_with_index {|p, i| break if i > 10; id = p["id"] || "none"; text = p.inner_text[0..40]; puts "  #{i}: id=#{id} text=#{text}"}' data/details/fi/de/38837_Trasylol_.html
  • dry run: list the p elements (id starting with "section") that would be processed
 bundle exec ruby -rnokogiri -e 'doc = Nokogiri::HTML(File.read(ARGV[0])); format = :swissmedicinfo; doc.search("p").each {|elem| next unless elem.attributes["id"]&.value&.match?(/^section/i); puts "Would process: #{elem["id"]} - #{elem.inner_text[0..40]}"}' data/details/fi/de/38837_Trasylol_.html | head -10
  • good test: renumber the section ids in memory so that Zusammensetzung/Composition becomes section1
 bundle exec ruby -I./src -I./ext/fiparse/src -rnokogiri -e 'doc = Nokogiri::HTML(File.read(ARGV[0])); composition_section = doc.search("*[@id^=\"section\"]").find {|elem| elem.inner_text.strip.match?(/^Zusammensetzung|^Composition/)}; if composition_section; match = composition_section["id"].match(/section(\d+)/i); if match; current_num = match[1].to_i; puts "Found Zusammensetzung at section#{current_num}"; if current_num != 1; doc.search("*[@id^=\"section\"]").each do |elem|; if m = elem["id"].match(/section(\d+)/i); num = m[1].to_i; if num < current_num; elem.remove; else; new_num = num - current_num + 1; elem["id"] = "section#{new_num}"; end; end; end; end; puts "\nAfter normalization:"; doc.search("*[@id^=\"section\"]")[0..4].each {|e| puts "  #{e["id"]}: #{e.inner_text[0..40]}"}; end; end' data/details/fi/de/38837_Trasylol_.html 

what does the parser see?

 bundle exec ruby -I./src -I./ext/fiparse/src -rnokogiri -e "file = ARGV[0]; doc = Nokogiri::HTML(File.read(file)); puts 'Format detected: ' + (doc.to_s.index('section1') ? 'swissmedicinfo' : 'other'); puts 'Title: ' + doc.at('title')&.text.to_s; puts 'Section1: ' + doc.at('#section1')&.text.to_s" data/details/fi/de/45510_Furospir_Filmtabletten.html

5.1.2026

  • some GnuCash errors.
 https://github.com/flathub/org.gnucash.GnuCash/issues/106

1.1.2026

  • Frohes neues Jahr!

Verzeichnis nach Jahren

view · edit · sidebar · attach · print · history
Page last modified on February 02, 2026, at 02:19 PM