ruby+watir 实现断点续传

寻技术 Ruby编程 2023年07月12日 157
$KCODE = "e"
require "watir"
# Start page that receives the search keywords.
$path = "http://www.baidu.com/"

# Raw lines of the keyword list. The block form of File.open closes the
# handle as soon as the lines are read (the handle used to stay open for
# the lifetime of the script).
lines = File.open("D:/20130409-专家信息系统-资源采集/专家清单.txt", "r") { |file| file.readlines }

# Result file, opened in append mode so earlier runs are preserved.
$fw = File.new("D:/专家清单-续断.txt", "a")

# One search keyword per input line: "<#>" markers and newlines are turned
# into spaces, then surrounding whitespace is stripped.
$ary = lines.map { |line| line.gsub(/<#>|\n/, " ").strip }

# Opens an Internet Explorer window through Watir and loads the page at $path.
# The browser object is stored in the global $ie before navigating, so the
# restart handler at the bottom of the script can still call $ie.close when
# the navigation itself raises.
def ie_open
  $ie = Watir::IE.new
  $ie.goto($path)
end

# Writes one search result to $fw.
#
# cell_text is the text of a result cell, lines separated by "\r\n": the
# first line is used as the title, the last line as "url ... date", and
# everything in between is concatenated into the abstract. Missing parts
# are written as empty fields instead of raising, so an empty result cell
# cannot abort the run.
def write_search_result(cell_text)
  lines = cell_text.split(/\r\n/)
  title = lines.first
  abs = (lines[1..-2] || []).join
  tail = lines.last.to_s.split(/\s/)
  url = tail.first
  # Explicit join instead of Array#to_s: Array#to_s only concatenates the
  # elements on Ruby 1.8; later versions return the inspect form.
  date = tail.join.scan(/\d{4}-\d{1,2}-\d{1,2}/).join
  $fw.puts "[title]#{title}"
  $fw.puts "[abs]#{abs}"
  $fw.puts "[url]#{url}"
  $fw.puts "[date]#{date}"
  $fw.puts "===================================="
end

# Collects search results for every keyword in $ary, starting at index id.
#
# For each keyword the search box (name "wd") is filled in, the search
# button (id "su") is clicked, and at most the first two result tables
# (ids "1" and "2") are written to $fw. After a keyword is finished its
# index is appended to $note_file, which is the checkpoint used to resume.
#
# id - index of the first keyword to process; anything responding to to_i.
#
# Uses the globals $ie, $ary, $fw and $note_file. Errors raised by the
# browser are not rescued here; the caller is expected to handle them.
def ie_write(id)
  (id.to_i...$ary.size).each do |j|
    $ie.text_field(:name, "wd").set $ary[j]
    $ie.button(:id, "su").click
    # NOTE(review): the "- 2" presumably skips two non-result tables on the
    # page -- confirm against the live markup.
    result_count = $ie.tables.to_a.size - 2
    $fw.puts $ary[j]
    $fw.puts "----------------------------------"
    (1..[result_count, 2].min).each do |n|
      table = $ie.table(:id, "#{n}")
      write_search_result(table[1][1].text)
    end
    # Flush the data before the checkpoint so the checkpoint never points
    # past results that were still sitting in a buffer when the process died.
    $fw.flush
    $note_file.puts j.to_s
    $note_file.flush
  end
end

# Starts (or resumes) a collection run.
#
# Opens the browser, makes sure the result file and the checkpoint log are
# open for appending, reads the last line of the checkpoint log to find the
# index of the last finished keyword, and continues with the next one. An
# empty log means the run starts at index 0.
#
# Exceptions from the browser or from file access propagate to the caller.
def start
  ie_open
  # Reuse the handle when it is still open instead of opening a second one
  # on the same file and orphaning the first.
  $fw = File.new("D:/专家清单-续断.txt", "a") if $fw.nil? || $fw.closed?
  # Opening in append mode also creates the log on the very first run, so
  # the read below always finds a file.
  $note_file = File.new("日志文件.txt", "a")
  # File.readlines closes its handle; File.new(...).readlines leaked one per
  # restart.
  last_entry = File.readlines("日志文件.txt").last.to_s
  finished = last_entry.scan(/\d{1,}/)
  resume_at = finished.empty? ? 0 : finished[0].to_i + 1
  ie_write(resume_at)
end
  
 #~ # 自动重新启动
# Supervisor loop: runs the collection in a worker thread and starts it again
# after a failure, resuming from the checkpoint written by ie_write.
#
# The main thread waits for the worker with Thread#join. The previous polling
# loop reset its counter on every pass, so its timeout could never fire, and
# it could restart while the worker was still closing files and the browser.
# The loop ends once a run finishes without raising.
loop do
  $a_run = true
  worker = Thread.new do
    begin
      start
    rescue => e
      puts "=====重新启动====="
      puts Time.now
      puts "#{e.class}: #{e.message}"
      $a_run = false
      # Close each resource independently: a handle that was never opened or
      # is already closed must not prevent the remaining cleanup.
      [$fw, $note_file].each do |io|
        io.close if io && !io.closed?
      end
      begin
        $ie.close if $ie # close the IE window opened by ie_open
        puts "已经关闭全部IE进程!等待重新启动···"
      rescue => close_error
        puts "#{close_error.class}: #{close_error.message}"
      end
    end
  end
  # Wait until the worker has finished, including its cleanup.
  worker.join
  # $a_run is still true only when start returned normally, i.e. every
  # keyword has been processed.
  break if $a_run
  sleep(10)
end

 

要求:

1.利用ruby+watir进行互联网海量资源采集;
2.模拟人工方式,动态输入关键字在百度中搜索相关文档内容;
3.解析网页内容并抽取相关信息,采集样例:

扩展:

程序实现断点续传功能(提示:添加日志文件,记录断点的位置,利用线程自动重启网页)

 

关闭

用微信“扫一扫”