最近一直在做域名筛选,因为之前一直在学javaweb,但是来到这个公司,一直是让我去截取数据。
所以自己也做下笔记吧..
先上图再说需求吧
需求:
如:把www.baidu.com 或者news.baidu.com,截取分布拆成
即,先去除www和后缀com/net/cn/com.cn....
然后输入结果为:.baidu.(根域名)news.(二级)news.baidu.(二级全域名)
实现步骤...
1.先把数据,即url除去www/后缀
2.使用java代码去跑,分析判断,如果遇到4级、5级域名,就重复跑(即把含有几级的数据,跑到只剩下根域名)
下面是主要代码吧。有点乱,做的过程我自己也很懵逼...
package com.glj.quchong; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.text.SimpleDateFormat; import java.util.Date; import comglj.utils.CharUtil; import comglj.utils.JDBCUtil; public class TestJDBC { public static void main(String[] args) { try{ Class.forName("com.mysql.jdbc.Driver"); Connection conn = JDBCUtil.getConnection(); Statement s = conn.createStatement(); String sql = "select * FROM sanji"; ResultSet rs = s.executeQuery(sql); while (rs.next()) { String id = rs.getString(1); String tac = rs.getString(2); String host = rs.getString(3); String is_reg = rs.getString(4); String host_id = rs.getString(5); String pass_time = ""; SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");//设置日期格式 pass_time = df.format(new Date()); //判断第一个.不在开头and最后一个.不在域名最后面 if(host.indexOf(".")>0&&host.indexOf(".")+1<host.length()){ //一级域名 // System.out.println(host+"--"+host.substring(0,host.indexOf(".")+1)); String hostByyiji = host.substring(0,host.indexOf(".")+1); String sqlByyiji = "insert into yiji (ID,TAC,HOST,IS_REG,HOST_ID,PASS_TIME) " + "values('"+id+"','"+tac+"','"+hostByyiji+"','"+is_reg+"','"+host_id+"','"+pass_time+"')"; Statement s1 = conn.createStatement(); s1.execute(sqlByyiji); //包含一级二级域名,再次去筛选截取 String hostByerji = host.substring(host.indexOf("."),host.length()); if(CharUtil.CharNumber(hostByerji)>2){ //二级及其以上 // System.out.println(hostByerji); hostByerji = host.substring(host.indexOf(".")+1,host.length()); // System.out.println(host+"----"+host.substring(host.indexOf(".")+1,host.length())); String sqlByerji = "insert into siji (ID,TAC,HOST,IS_REG,HOST_ID,PASS_TIME) " + "values('"+id+"','"+tac+"','"+hostByerji+"','"+is_reg+"','"+host_id+"','"+pass_time+"')"; Statement sterji = conn.createStatement(); sterji.execute(sqlByerji); }else{ //根域名 // System.out.println(hostByerji); String sqlBygeng = "insert into geng (ID,TAC,HOST,IS_REG,HOST_ID,PASS_TIME) " + "values('"+id+"','"+tac+"','"+hostByerji+"','"+is_reg+"','"+host_id+"','"+pass_time+"')"; Statement st1 = conn.createStatement(); st1.execute(sqlBygeng); } }else{ //根域名 // System.out.println(host); String sqlBygeng = "insert into geng (ID,TAC,HOST,IS_REG,HOST_ID,PASS_TIME) " + "values('"+id+"','"+tac+"','"+host+"','"+is_reg+"','"+host_id+"','"+pass_time+"')"; Statement st1 = conn.createStatement(); st1.execute(sqlBygeng); } } }catch(ClassNotFoundException | SQLException e){ e.printStackTrace(); } } }
下面代码是判断域名里面还有几个‘.’即可判断还有几级没有拆分
package comglj.utils; public class CharUtil { public static int CharNumber(String url){ //需要查找重复的字或者符号 char c = '.'; int i,index=-1,count=0; for(i=0;i<url.length();i++) if((url.indexOf(c,index+1))!=-1){ index=url.indexOf(c,index+1); count++; } return count; } }
数据库类
package comglj.utils; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; public class JDBCUtil { private static String url; private static String username; private static String password; public JDBCUtil() { super(); // TODO Auto-generated constructor stub } static { try { Class.forName("com.mysql.jdbc.Driver"); url = "jdbc:mysql://127.0.0.1:3306/work?characterEncoding=UTF-8"; username = "root"; password = "root"; } catch (Exception e) { // TODO: handle exception } } public static Connection getConnection() throws SQLException{ Connection con = DriverManager.getConnection(url,username,password); return con; } public static void main(String[] args) { try { Connection conn = JDBCUtil.getConnection(); String sql = "select * from k2"; PreparedStatement ps = conn.prepareStatement(sql); ResultSet rs = ps.executeQuery(sql); while(rs.next()){ String name = rs.getString("NAME"); System.out.println(name); } } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
最后在用sql语句 统计拆分出现的次数
SELECT MFR_NAM_ID as ID,TAC,RULE as 域名,count(1) as 次数 from x5
GROUP BY MFR_NAM_ID,TAC,RULE ORDER BY count(1) DESC