Skip to content

Commit 95733f0

Browse files
authored
Merge pull request olegshan#1 from Antrakos/master
Design changes and recommendations
2 parents a236645 + 9076b4d commit 95733f0

21 files changed

+483
-372
lines changed

pom.xml

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -40,18 +40,18 @@
4040
<artifactId>spring-boot-starter-data-jpa</artifactId>
4141
</dependency>
4242

43-
<!--<dependency>
44-
<groupId>org.springframework.boot</groupId>
45-
<artifactId>spring-boot-starter-tomcat</artifactId>
46-
<scope>provided</scope>
47-
</dependency>-->
48-
4943
<dependency>
5044
<groupId>org.postgresql</groupId>
5145
<artifactId>postgresql</artifactId>
5246
<version>9.4.1211.jre7</version>
5347
</dependency>
5448

49+
<dependency>
50+
<groupId>org.projectlombok</groupId>
51+
<artifactId>lombok</artifactId>
52+
<version>1.16.10</version>
53+
</dependency>
54+
5555
<dependency>
5656
<groupId>org.springframework.boot</groupId>
5757
<artifactId>spring-boot-starter-test</artifactId>

src/main/java/com/olegshan/controllers/ParseController.java

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,12 @@
2020
public class ParseController {
2121

2222
private static final int PAGE_SIZE = 40;
23+
private JobService jobService;
2324

2425
@Autowired
25-
JobService jobService;
26+
public ParseController(JobService jobService) {
27+
this.jobService = jobService;
28+
}
2629

2730
@RequestMapping(value = "/", method = RequestMethod.GET)
2831
public ModelAndView showJobs(@RequestParam(value = "page", required = false) Integer page) {
src/main/java/com/olegshan/entity/Job.java

Lines changed: 7 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,19 @@
11
package com.olegshan.entity;
22

3+
import lombok.Data;
4+
35
import javax.persistence.Column;
46
import javax.persistence.Entity;
57
import javax.persistence.Id;
68
import java.time.LocalDateTime;
7-
import java.time.format.DateTimeFormatter;
9+
10+
import static java.time.format.DateTimeFormatter.ofPattern;
811

912
/**
1013
* Created by olegshan on 24.09.2016.
1114
*/
1215
@Entity
16+
@Data
1317
public class Job {
1418

1519
@Id
@@ -23,68 +27,16 @@ public class Job {
2327
private LocalDateTime date;
2428
private String dateToDisplay;
2529

26-
public Job() {
27-
}
28-
29-
public Job(String title, String description, String company, String source, String url, LocalDateTime date) {
30-
this.title = title;
31-
this.description = description;
32-
this.company = company;
33-
this.source = source;
30+
public Job(String url, String title, String description, String company, String source, LocalDateTime date) {
3431
this.url = url;
35-
this.date = date;
36-
}
37-
38-
public String getTitle() {
39-
return title;
40-
}
41-
42-
public void setTitle(String title) {
4332
this.title = title;
44-
}
45-
46-
public String getDescription() {
47-
return description;
48-
}
49-
50-
public void setDescription(String description) {
5133
this.description = description;
52-
}
53-
54-
public String getCompany() {
55-
return company;
56-
}
57-
58-
public void setCompany(String company) {
5934
this.company = company;
60-
}
61-
62-
public String getSource() {
63-
return source;
64-
}
65-
66-
public void setSource(String source) {
6735
this.source = source;
68-
}
69-
70-
public String getUrl() {
71-
return url;
72-
}
73-
74-
public void setUrl(String url) {
75-
this.url = url;
76-
}
77-
78-
public LocalDateTime getDate() {
79-
return date;
80-
}
81-
82-
public void setDate(LocalDateTime date) {
8336
this.date = date;
8437
}
8538

8639
public String getDateToDisplay() {
87-
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("d MMMM");
88-
return date.format(formatter);
40+
return date.format(ofPattern("d MMMM"));
8941
}
9042
}
src/main/java/com/olegshan/parser/Parser.java

Lines changed: 3 additions & 201 deletions
Original file line number | Diff line number | Diff line change
@@ -1,208 +1,10 @@
11
package com.olegshan.parser;
22

3-
import com.olegshan.entity.Job;
4-
import com.olegshan.service.JobService;
5-
import com.olegshan.sites.*;
6-
import com.olegshan.tools.MonthsTools;
7-
import org.jsoup.Jsoup;
8-
import org.jsoup.nodes.Document;
9-
import org.jsoup.nodes.Element;
10-
import org.jsoup.select.Elements;
11-
import org.slf4j.Logger;
12-
import org.slf4j.LoggerFactory;
13-
import org.springframework.beans.factory.annotation.Autowired;
14-
import org.springframework.stereotype.Component;
15-
16-
import java.io.IOException;
17-
import java.time.LocalDate;
18-
import java.time.LocalDateTime;
19-
import java.time.LocalTime;
20-
import java.time.ZoneId;
3+
import com.olegshan.sites.JobSite;
214

225
/**
236
* Created by olegshan on 03.10.2016.
247
*/
25-
@Component
26-
public class Parser {
27-
28-
private static final Logger LOGGER = LoggerFactory.getLogger(Parser.class);
29-
@Autowired
30-
private JobService jobService;
31-
32-
public void parse(JobSite jobSite) {
33-
34-
Document doc = getDoc(jobSite.getSiteUrl());
35-
Elements jobBlocks = getJobBlocks(jobSite, doc);
36-
37-
for (Element job : jobBlocks) {
38-
Elements titleBlock = getTitleBlock(jobSite, job);
39-
String url = jobSite.getUrlPrefix() + titleBlock.attr("href");
40-
String title = getTitle(titleBlock);
41-
String description = getDescription(jobSite, job);
42-
String company = getCompany(jobSite, job, url);
43-
LocalDateTime date = getDate(jobSite, job, url, titleBlock);
44-
45-
Job parsedJob = new Job(title, description, company, jobSite.getSiteName(), url, date);
46-
jobService.save(parsedJob);
47-
}
48-
LOGGER.info("Parsing of {} completed", jobSite.getSiteName());
49-
}
50-
51-
private Document getDoc(String siteUrl) {
52-
try {
53-
return Jsoup.connect(siteUrl).userAgent("Mozilla").timeout(0).get();
54-
} catch (IOException e) {
55-
LOGGER.error("Connecting to {} failed", siteUrl);
56-
throw new RuntimeException("Connection failed to " + siteUrl);
57-
}
58-
}
59-
60-
private Elements getJobBlocks(JobSite jobSite, Document doc) {
61-
String[] jobBox = jobSite.getJobBox();
62-
if (jobSite instanceof WorkUa) {
63-
return doc.getElementsByAttributeValueStarting(jobBox[0], jobBox[1]);
64-
} else if (jobSite instanceof RabotaUa) {
65-
return getJobBlocksForRabotaUa(doc, jobBox);
66-
} else {
67-
return doc.getElementsByAttributeValue(jobBox[0], jobBox[1]);
68-
}
69-
}
70-
71-
private Elements getJobBlocksForRabotaUa(Document doc, String[] jobBox) {
72-
Elements jobBlocks = new Elements();
73-
for (int i = 1; i < jobBox.length; i++) {
74-
Elements jobElements = doc.getElementsByAttributeValue(jobBox[0], jobBox[i]);
75-
if (jobElements != null && !jobElements.isEmpty()) {
76-
jobBlocks.addAll(jobElements);
77-
}
78-
}
79-
return jobBlocks;
80-
}
81-
82-
private Elements getTitleBlock(JobSite jobSite, Element job) {
83-
String[] titleBox = jobSite.getTitleBox();
84-
if (jobSite instanceof WorkUa) {
85-
return job.getElementsByTag("a");
86-
} else {
87-
return job.getElementsByAttributeValue(titleBox[0], titleBox[1]);
88-
}
89-
}
90-
91-
private String getTitle(Elements titleBlock) {
92-
String title = titleBlock.text();
93-
if (title.endsWith("Горячая")) {
94-
title = title.substring(0, title.length() - "Горячая".length());
95-
}
96-
return title;
97-
}
98-
99-
private String getDescription(JobSite jobSite, Element job) {
100-
String[] descriptionData = jobSite.getDescriptionData();
101-
return job.getElementsByAttributeValue(descriptionData[0], descriptionData[1]).text();
102-
}
103-
104-
private LocalDateTime getDate(JobSite jobSite, Element job, String url, Elements titleBlock) {
105-
String[] dateData = jobSite.getDateData();
106-
String dateLine;
107-
if (jobSite instanceof DouUa) {
108-
Document dateDoc = getDoc(url);
109-
dateLine = dateDoc.getElementsByAttributeValue(dateData[0], dateData[1]).text();
110-
return getDateByLine(jobSite, dateLine);
111-
} else if (jobSite instanceof RabotaUa) {
112-
return getDateForRabotaUa(url);
113-
} else {
114-
if (jobSite instanceof WorkUa) {
115-
dateLine = titleBlock.attr("title");
116-
} else {
117-
dateLine = job.getElementsByAttributeValue(dateData[0], dateData[1]).text();
118-
}
119-
return getDateByLine(jobSite, dateLine);
120-
}
121-
}
122-
123-
private LocalDateTime getDateByLine(JobSite jobSite, String dateLine) {
124-
String[] dateParts;
125-
int year;
126-
int month;
127-
int day;
128-
String split = jobSite.getSplit();
129-
if (jobSite instanceof JobsUa) {
130-
dateLine = dateLine.substring(0, 10);
131-
}
132-
if (jobSite instanceof WorkUa) {
133-
dateLine = dateLine.substring(dateLine.length() - 8);
134-
}
135-
dateParts = dateLine.split(split);
136-
MonthsTools.removeZero(dateParts);
137-
day = Integer.parseInt(dateParts[0]);
138-
if (jobSite instanceof HeadHunterUa) {
139-
year = LocalDate.now(ZoneId.of("Europe/Athens")).getYear();
140-
} else year = Integer.parseInt(dateParts[2]);
141-
if (jobSite instanceof WorkUa) {
142-
year = year + 2000;
143-
}
144-
if (jobSite instanceof DouUa || jobSite instanceof HeadHunterUa) {
145-
month = MonthsTools.MONTHS.get(dateParts[1].toLowerCase());
146-
} else month = Integer.parseInt(dateParts[1]);
147-
return LocalDate.of(year, month, day).atTime(getTime());
148-
}
149-
150-
private LocalDateTime getDateForRabotaUa(String url) {
151-
/*
152-
* There are several problems here.
153-
* First: there are two types of date tags, used on rabota.ua on different pages: "d-date" and "datePosted".
154-
* Second: sometimes date format is dd.mm.yyyy and sometimes — yyyy-mm-dd.
155-
* Third: sometimes there is no date at all.
156-
*/
157-
Document dateDoc = getDoc(url);
158-
String dateLine;
159-
String[] dateParts;
160-
int year;
161-
int month;
162-
int day;
163-
164-
Elements dateElements = dateDoc.getElementsByAttributeValue("id", "d-date");
165-
if (!dateElements.isEmpty()) {
166-
dateLine = dateElements.get(0).getElementsByAttributeValue("class", "d-ph-value").text();
167-
} else {
168-
dateLine = dateDoc.getElementsByAttributeValue("itemprop", "datePosted").text();
169-
if (dateLine.length() == 0) {
170-
//no date at all, sometimes it happens
171-
LocalDateTime ldt = LocalDateTime.now(ZoneId.of("Europe/Athens"));
172-
LOGGER.debug("There was no date on Rabota.ua, return {}", ldt);
173-
return ldt;
174-
}
175-
}
176-
try {
177-
dateParts = dateLine.split("\\.");
178-
MonthsTools.removeZero(dateParts);
179-
year = Integer.parseInt(dateParts[2]);
180-
month = Integer.parseInt(dateParts[1]);
181-
day = Integer.parseInt(dateParts[0]);
182-
183-
} catch (ArrayIndexOutOfBoundsException e) {
184-
185-
dateParts = dateLine.split("-");
186-
MonthsTools.removeZero(dateParts);
187-
year = Integer.parseInt(dateParts[0]);
188-
month = Integer.parseInt(dateParts[1]);
189-
day = Integer.parseInt(dateParts[2]);
190-
}
191-
return LocalDate.of(year, month, day).atTime(getTime());
192-
}
193-
194-
private LocalTime getTime() {
195-
return LocalTime.now(ZoneId.of("Europe/Athens"));
196-
}
197-
198-
private String getCompany(JobSite jobSite, Element job, String url) {
199-
String[] companyData = jobSite.getCompanyData();
200-
if (jobSite instanceof JobsUa || jobSite instanceof WorkUa) {
201-
Document jobDoc = getDoc(url);
202-
Elements companyBlock = jobDoc.getElementsByAttributeValue(companyData[0], companyData[1]);
203-
return companyBlock.get(0).getElementsByTag("a").first().text();
204-
} else {
205-
return job.getElementsByAttributeValue(companyData[0], companyData[1]).text();
206-
}
207-
}
8+
public interface Parser {
9+
void parse(JobSite jobSite);
20810
}
src/main/java/com/olegshan/parser/Performer.java

Lines changed: 11 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,33 +1,29 @@
11
package com.olegshan.parser;
22

3-
import com.olegshan.sites.*;
3+
import com.olegshan.sites.JobSite;
44
import org.springframework.beans.factory.annotation.Autowired;
55
import org.springframework.scheduling.annotation.Scheduled;
66
import org.springframework.stereotype.Component;
77

8+
import java.util.List;
9+
810
/**
911
* Created by olegshan on 06.10.2016.
1012
*/
1113
@Component
1214
public class Performer {
15+
private List<JobSite> sites;
16+
private Parser parser;
1317

1418
@Autowired
15-
private DouUa douUa;
16-
@Autowired
17-
private RabotaUa rabotaUa;
18-
@Autowired
19-
private JobsUa jobsUa;
20-
@Autowired
21-
private WorkUa workUa;
22-
@Autowired
23-
private HeadHunterUa headHunterUa;
19+
public Performer(List<JobSite> sites, Parser parser) {
20+
this.sites = sites;
21+
this.parser = parser;
22+
}
2423

2524
@Scheduled(cron = "0 1 7-23 * * *", zone = "Europe/Athens")
2625
public void perform() {
27-
douUa.parse();
28-
headHunterUa.parse();
29-
jobsUa.parse();
30-
rabotaUa.parse();
31-
workUa.parse();
26+
for (JobSite jobSite : sites)
27+
parser.parse(jobSite);
3228
}
3329
}

0 commit comments

Comments
 (0)