|
1 | 1 | package com.olegshan.parser;
|
2 | 2 |
|
3 |
| -import com.olegshan.entity.Job; |
4 |
| -import com.olegshan.service.JobService; |
5 |
| -import com.olegshan.sites.*; |
6 |
| -import com.olegshan.tools.MonthsTools; |
7 |
| -import org.jsoup.Jsoup; |
8 |
| -import org.jsoup.nodes.Document; |
9 |
| -import org.jsoup.nodes.Element; |
10 |
| -import org.jsoup.select.Elements; |
11 |
| -import org.slf4j.Logger; |
12 |
| -import org.slf4j.LoggerFactory; |
13 |
| -import org.springframework.beans.factory.annotation.Autowired; |
14 |
| -import org.springframework.stereotype.Component; |
15 |
| - |
16 |
| -import java.io.IOException; |
17 |
| -import java.time.LocalDate; |
18 |
| -import java.time.LocalDateTime; |
19 |
| -import java.time.LocalTime; |
20 |
| -import java.time.ZoneId; |
| 3 | +import com.olegshan.sites.JobSite; |
21 | 4 |
|
22 | 5 | /**
|
23 | 6 | * Created by olegshan on 03.10.2016.
|
24 | 7 | */
|
25 |
| -@Component |
26 |
| -public class Parser { |
27 |
| - |
28 |
| - private static final Logger LOGGER = LoggerFactory.getLogger(Parser.class); |
29 |
| - @Autowired |
30 |
| - private JobService jobService; |
31 |
| - |
32 |
| - public void parse(JobSite jobSite) { |
33 |
| - |
34 |
| - Document doc = getDoc(jobSite.getSiteUrl()); |
35 |
| - Elements jobBlocks = getJobBlocks(jobSite, doc); |
36 |
| - |
37 |
| - for (Element job : jobBlocks) { |
38 |
| - Elements titleBlock = getTitleBlock(jobSite, job); |
39 |
| - String url = jobSite.getUrlPrefix() + titleBlock.attr("href"); |
40 |
| - String title = getTitle(titleBlock); |
41 |
| - String description = getDescription(jobSite, job); |
42 |
| - String company = getCompany(jobSite, job, url); |
43 |
| - LocalDateTime date = getDate(jobSite, job, url, titleBlock); |
44 |
| - |
45 |
| - Job parsedJob = new Job(title, description, company, jobSite.getSiteName(), url, date); |
46 |
| - jobService.save(parsedJob); |
47 |
| - } |
48 |
| - LOGGER.info("Parsing of {} completed", jobSite.getSiteName()); |
49 |
| - } |
50 |
| - |
51 |
| - private Document getDoc(String siteUrl) { |
52 |
| - try { |
53 |
| - return Jsoup.connect(siteUrl).userAgent("Mozilla").timeout(0).get(); |
54 |
| - } catch (IOException e) { |
55 |
| - LOGGER.error("Connecting to {} failed", siteUrl); |
56 |
| - throw new RuntimeException("Connection failed to " + siteUrl); |
57 |
| - } |
58 |
| - } |
59 |
| - |
60 |
| - private Elements getJobBlocks(JobSite jobSite, Document doc) { |
61 |
| - String[] jobBox = jobSite.getJobBox(); |
62 |
| - if (jobSite instanceof WorkUa) { |
63 |
| - return doc.getElementsByAttributeValueStarting(jobBox[0], jobBox[1]); |
64 |
| - } else if (jobSite instanceof RabotaUa) { |
65 |
| - return getJobBlocksForRabotaUa(doc, jobBox); |
66 |
| - } else { |
67 |
| - return doc.getElementsByAttributeValue(jobBox[0], jobBox[1]); |
68 |
| - } |
69 |
| - } |
70 |
| - |
71 |
| - private Elements getJobBlocksForRabotaUa(Document doc, String[] jobBox) { |
72 |
| - Elements jobBlocks = new Elements(); |
73 |
| - for (int i = 1; i < jobBox.length; i++) { |
74 |
| - Elements jobElements = doc.getElementsByAttributeValue(jobBox[0], jobBox[i]); |
75 |
| - if (jobElements != null && !jobElements.isEmpty()) { |
76 |
| - jobBlocks.addAll(jobElements); |
77 |
| - } |
78 |
| - } |
79 |
| - return jobBlocks; |
80 |
| - } |
81 |
| - |
82 |
| - private Elements getTitleBlock(JobSite jobSite, Element job) { |
83 |
| - String[] titleBox = jobSite.getTitleBox(); |
84 |
| - if (jobSite instanceof WorkUa) { |
85 |
| - return job.getElementsByTag("a"); |
86 |
| - } else { |
87 |
| - return job.getElementsByAttributeValue(titleBox[0], titleBox[1]); |
88 |
| - } |
89 |
| - } |
90 |
| - |
91 |
| - private String getTitle(Elements titleBlock) { |
92 |
| - String title = titleBlock.text(); |
93 |
| - if (title.endsWith("Горячая")) { |
94 |
| - title = title.substring(0, title.length() - "Горячая".length()); |
95 |
| - } |
96 |
| - return title; |
97 |
| - } |
98 |
| - |
99 |
| - private String getDescription(JobSite jobSite, Element job) { |
100 |
| - String[] descriptionData = jobSite.getDescriptionData(); |
101 |
| - return job.getElementsByAttributeValue(descriptionData[0], descriptionData[1]).text(); |
102 |
| - } |
103 |
| - |
104 |
| - private LocalDateTime getDate(JobSite jobSite, Element job, String url, Elements titleBlock) { |
105 |
| - String[] dateData = jobSite.getDateData(); |
106 |
| - String dateLine; |
107 |
| - if (jobSite instanceof DouUa) { |
108 |
| - Document dateDoc = getDoc(url); |
109 |
| - dateLine = dateDoc.getElementsByAttributeValue(dateData[0], dateData[1]).text(); |
110 |
| - return getDateByLine(jobSite, dateLine); |
111 |
| - } else if (jobSite instanceof RabotaUa) { |
112 |
| - return getDateForRabotaUa(url); |
113 |
| - } else { |
114 |
| - if (jobSite instanceof WorkUa) { |
115 |
| - dateLine = titleBlock.attr("title"); |
116 |
| - } else { |
117 |
| - dateLine = job.getElementsByAttributeValue(dateData[0], dateData[1]).text(); |
118 |
| - } |
119 |
| - return getDateByLine(jobSite, dateLine); |
120 |
| - } |
121 |
| - } |
122 |
| - |
123 |
| - private LocalDateTime getDateByLine(JobSite jobSite, String dateLine) { |
124 |
| - String[] dateParts; |
125 |
| - int year; |
126 |
| - int month; |
127 |
| - int day; |
128 |
| - String split = jobSite.getSplit(); |
129 |
| - if (jobSite instanceof JobsUa) { |
130 |
| - dateLine = dateLine.substring(0, 10); |
131 |
| - } |
132 |
| - if (jobSite instanceof WorkUa) { |
133 |
| - dateLine = dateLine.substring(dateLine.length() - 8); |
134 |
| - } |
135 |
| - dateParts = dateLine.split(split); |
136 |
| - MonthsTools.removeZero(dateParts); |
137 |
| - day = Integer.parseInt(dateParts[0]); |
138 |
| - if (jobSite instanceof HeadHunterUa) { |
139 |
| - year = LocalDate.now(ZoneId.of("Europe/Athens")).getYear(); |
140 |
| - } else year = Integer.parseInt(dateParts[2]); |
141 |
| - if (jobSite instanceof WorkUa) { |
142 |
| - year = year + 2000; |
143 |
| - } |
144 |
| - if (jobSite instanceof DouUa || jobSite instanceof HeadHunterUa) { |
145 |
| - month = MonthsTools.MONTHS.get(dateParts[1].toLowerCase()); |
146 |
| - } else month = Integer.parseInt(dateParts[1]); |
147 |
| - return LocalDate.of(year, month, day).atTime(getTime()); |
148 |
| - } |
149 |
| - |
150 |
| - private LocalDateTime getDateForRabotaUa(String url) { |
151 |
| - /* |
152 |
| - * There are several problems here. |
153 |
| - * First: there are two types of date tags, used on rabota.ua on different pages: "d-date" and "datePosted". |
154 |
| - * Second: sometimes date format is dd.mm.yyyy and sometimes — yyyy-mm-dd. |
155 |
| - * Third: sometimes there is no date at all. |
156 |
| - */ |
157 |
| - Document dateDoc = getDoc(url); |
158 |
| - String dateLine; |
159 |
| - String[] dateParts; |
160 |
| - int year; |
161 |
| - int month; |
162 |
| - int day; |
163 |
| - |
164 |
| - Elements dateElements = dateDoc.getElementsByAttributeValue("id", "d-date"); |
165 |
| - if (!dateElements.isEmpty()) { |
166 |
| - dateLine = dateElements.get(0).getElementsByAttributeValue("class", "d-ph-value").text(); |
167 |
| - } else { |
168 |
| - dateLine = dateDoc.getElementsByAttributeValue("itemprop", "datePosted").text(); |
169 |
| - if (dateLine.length() == 0) { |
170 |
| - //no date at all, sometimes it happens |
171 |
| - LocalDateTime ldt = LocalDateTime.now(ZoneId.of("Europe/Athens")); |
172 |
| - LOGGER.debug("There was no date on Rabota.ua, return {}", ldt); |
173 |
| - return ldt; |
174 |
| - } |
175 |
| - } |
176 |
| - try { |
177 |
| - dateParts = dateLine.split("\\."); |
178 |
| - MonthsTools.removeZero(dateParts); |
179 |
| - year = Integer.parseInt(dateParts[2]); |
180 |
| - month = Integer.parseInt(dateParts[1]); |
181 |
| - day = Integer.parseInt(dateParts[0]); |
182 |
| - |
183 |
| - } catch (ArrayIndexOutOfBoundsException e) { |
184 |
| - |
185 |
| - dateParts = dateLine.split("-"); |
186 |
| - MonthsTools.removeZero(dateParts); |
187 |
| - year = Integer.parseInt(dateParts[0]); |
188 |
| - month = Integer.parseInt(dateParts[1]); |
189 |
| - day = Integer.parseInt(dateParts[2]); |
190 |
| - } |
191 |
| - return LocalDate.of(year, month, day).atTime(getTime()); |
192 |
| - } |
193 |
| - |
194 |
| - private LocalTime getTime() { |
195 |
| - return LocalTime.now(ZoneId.of("Europe/Athens")); |
196 |
| - } |
197 |
| - |
198 |
| - private String getCompany(JobSite jobSite, Element job, String url) { |
199 |
| - String[] companyData = jobSite.getCompanyData(); |
200 |
| - if (jobSite instanceof JobsUa || jobSite instanceof WorkUa) { |
201 |
| - Document jobDoc = getDoc(url); |
202 |
| - Elements companyBlock = jobDoc.getElementsByAttributeValue(companyData[0], companyData[1]); |
203 |
| - return companyBlock.get(0).getElementsByTag("a").first().text(); |
204 |
| - } else { |
205 |
| - return job.getElementsByAttributeValue(companyData[0], companyData[1]).text(); |
206 |
| - } |
207 |
| - } |
| 8 | +public interface Parser { |
| 9 | + void parse(JobSite jobSite); |
208 | 10 | }
|
0 commit comments